howard.objects.variants

    1import csv
    2import gc
    3import gzip
    4import io
    5import multiprocessing as mp
    6import os
    7import random
    8import re
    9import shlex
   10import sqlite3
   11import subprocess
   12from tempfile import NamedTemporaryFile, TemporaryDirectory
   13import tempfile
   14import duckdb
   15import json
   16import yaml
   17import argparse
   18import Bio.bgzf as bgzf
   19import pandas as pd
   20from pyfaidx import Fasta
   21import numpy as np
   22import vcf
   23import logging as log
   24import fastparquet as fp
   25from multiprocesspandas import applyparallel
   26import cyvcf2
   27import pyBigWig
   28import math
   29
   30from howard.functions.commons import *
   31from howard.objects.database import *
   32from howard.functions.databases import *
   33from howard.functions.utils import *
   34
   35
   36class Variants:
   37
   38    def __init__(
   39        self,
   40        conn=None,
   41        input: str = None,
   42        output: str = None,
   43        config: dict = {},
   44        param: dict = {},
   45        load: bool = False,
   46    ) -> None:
   47        """
   48        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
   49        header
   50
   51        :param conn: the connection to the database
   52        :param input: the input file
   53        :param output: the output file
   54        :param config: a dictionary containing the configuration of the model
   55        :param param: a dictionary containing the parameters of the model
   56        """
   57
   58        # Init variables
   59        self.init_variables()
   60
   61        # Input
   62        self.set_input(input)
   63
   64        # Config
   65        self.set_config(config)
   66
   67        # Param
   68        self.set_param(param)
   69
   70        # Output
   71        self.set_output(output)
   72
   73        # connexion
   74        self.set_connexion(conn)
   75
   76        # Header
   77        self.set_header()
   78
   79        # Samples
   80        self.set_samples()
   81
   82        # Load data
   83        if load:
   84            self.load_data()
   85
   86    def set_samples(self, samples: list = None) -> list:
   87        """
   88        The function `set_samples` sets the samples attribute of an object to a provided list or
   89        retrieves it from a parameter dictionary.
   90
   91        :param samples: The `set_samples` method is a method of a class that takes a list of samples as
   92        input and sets the `samples` attribute of the class to the provided list. If no samples are
   93        provided, it tries to get the samples from the class's parameters using the `get_param` method
   94        :type samples: list
   95        :return: The `samples` list is being returned.
   96        """
   97
   98        if not samples:
   99            samples = self.get_param().get("samples", {}).get("list", None)
  100
  101        self.samples = samples
  102
  103        return samples
  104
  105    def get_samples(self) -> list:
  106        """
  107        This function returns a list of samples.
  108        :return: The `get_samples` method is returning the `samples` attribute of the object.
  109        """
  110
  111        return self.samples
  112
  113    def get_samples_check(self) -> bool:
  114        """
  115        This function returns the value of the "check" key within the "samples" dictionary retrieved
  116        from the parameters.
  117        :return: The method `get_samples_check` is returning the value of the key "check" inside the
  118        "samples" dictionary, which is nested inside the dictionary returned by the `get_param()`
  119        method. If the key "check" is not found, it will return `False`.
  120        """
  121
  122        return self.get_param().get("samples", {}).get("check", True)
  123
  124    def set_input(self, input: str = None) -> None:
  125        """
  126        The function `set_input` takes a file name as input, extracts the name and extension, and sets
  127        attributes in the class accordingly.
  128
  129        :param input: The `set_input` method in the provided code snippet is used to set attributes
  130        related to the input file. Here's a breakdown of the parameters and their usage in the method:
  131        :type input: str
  132        """
  133
  134        if input and not isinstance(input, str):
  135            try:
  136                self.input = input.name
  137            except:
  138                log.error(f"Input file '{input} in bad format")
  139                raise ValueError(f"Input file '{input} in bad format")
  140        else:
  141            self.input = input
  142
  143        # Input format
  144        if input:
  145            input_name, input_extension = os.path.splitext(self.input)
  146            self.input_name = input_name
  147            self.input_extension = input_extension
  148            self.input_format = self.input_extension.replace(".", "")
  149
  150    def set_config(self, config: dict) -> None:
  151        """
  152        The set_config function takes a config object and assigns it as the configuration object for the
  153        class.
  154
  155        :param config: The `config` parameter in the `set_config` function is a dictionary object that
  156        contains configuration settings for the class. When you call the `set_config` function with a
  157        dictionary object as the argument, it will set that dictionary as the configuration object for
  158        the class
  159        :type config: dict
  160        """
  161
  162        self.config = config
  163
  164    def set_param(self, param: dict) -> None:
  165        """
  166        This function sets a parameter object for the class based on the input dictionary.
  167
  168        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
  169        as the `param` attribute of the class instance
  170        :type param: dict
  171        """
  172
  173        self.param = param
  174
  175    def init_variables(self) -> None:
  176        """
  177        This function initializes the variables that will be used in the rest of the class
  178        """
  179
  180        self.prefix = "howard"
  181        self.table_variants = "variants"
  182        self.dataframe = None
  183
  184        self.comparison_map = {
  185            "gt": ">",
  186            "gte": ">=",
  187            "lt": "<",
  188            "lte": "<=",
  189            "equals": "=",
  190            "contains": "SIMILAR TO",
  191        }
  192
  193        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
  194
  195        self.code_type_map_to_sql = {
  196            "Integer": "INTEGER",
  197            "String": "VARCHAR",
  198            "Float": "FLOAT",
  199            "Flag": "VARCHAR",
  200        }
  201
  202        self.index_additionnal_fields = []
  203
  204    def get_indexing(self) -> bool:
  205        """
  206        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
  207        returns False.
  208        :return: The value of the indexing parameter.
  209        """
  210
  211        return self.get_param().get("indexing", False)
  212
  213    def get_connexion_config(self) -> dict:
  214        """
  215        The function `get_connexion_config` returns a dictionary containing the configuration for a
  216        connection, including the number of threads and memory limit.
  217        :return: a dictionary containing the configuration for the Connexion library.
  218        """
  219
  220        # config
  221        config = self.get_config()
  222
  223        # Connexion config
  224        connexion_config = {}
  225        threads = self.get_threads()
  226
  227        # Threads
  228        if threads:
  229            connexion_config["threads"] = threads
  230
  231        # Memory
  232        # if config.get("memory", None):
  233        #     connexion_config["memory_limit"] = config.get("memory")
  234        if self.get_memory():
  235            connexion_config["memory_limit"] = self.get_memory()
  236
  237        # Temporary directory
  238        if config.get("tmp", None):
  239            connexion_config["temp_directory"] = config.get("tmp")
  240
  241        # Access
  242        if config.get("access", None):
  243            access = config.get("access")
  244            if access in ["RO"]:
  245                access = "READ_ONLY"
  246            elif access in ["RW"]:
  247                access = "READ_WRITE"
  248            connexion_db = self.get_connexion_db()
  249            if connexion_db in ":memory:":
  250                access = "READ_WRITE"
  251            connexion_config["access_mode"] = access
  252
  253        return connexion_config
  254
  255    def get_duckdb_settings(self) -> dict:
  256        """
  257        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
  258        string.
  259        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
  260        """
  261
  262        # config
  263        config = self.get_config()
  264
  265        # duckdb settings
  266        duckdb_settings_dict = {}
  267        if config.get("duckdb_settings", None):
  268            duckdb_settings = config.get("duckdb_settings")
  269            duckdb_settings = full_path(duckdb_settings)
  270            # duckdb setting is a file
  271            if os.path.exists(duckdb_settings):
  272                with open(duckdb_settings) as json_file:
  273                    duckdb_settings_dict = yaml.safe_load(json_file)
  274            # duckdb settings is a string
  275            else:
  276                duckdb_settings_dict = json.loads(duckdb_settings)
  277
  278        return duckdb_settings_dict
  279
  280    def set_connexion_db(self) -> str:
  281        """
  282        The function `set_connexion_db` returns the appropriate database connection string based on the
  283        input format and connection type.
  284        :return: the value of the variable `connexion_db`.
  285        """
  286
  287        # Default connexion db
  288        default_connexion_db = ":memory:"
  289
  290        # Find connexion db
  291        if self.get_input_format() in ["db", "duckdb"]:
  292            connexion_db = self.get_input()
  293        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
  294            connexion_db = default_connexion_db
  295        elif self.get_connexion_type() in ["tmpfile"]:
  296            tmp_name = tempfile.mkdtemp(
  297                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
  298            )
  299            connexion_db = f"{tmp_name}/tmp.db"
  300        elif self.get_connexion_type() != "":
  301            connexion_db = self.get_connexion_type()
  302        else:
  303            connexion_db = default_connexion_db
  304
  305        # Set connexion db
  306        self.connexion_db = connexion_db
  307
  308        return connexion_db
  309
  310    def set_connexion(self, conn) -> None:
  311        """
  312        The function `set_connexion` creates a connection to a database, with options for different
  313        database formats and settings.
  314
  315        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
  316        database. If a connection is not provided, a new connection to an in-memory database is created.
  317        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
  318        sqlite
  319        """
  320
  321        # Connexion db
  322        connexion_db = self.set_connexion_db()
  323
  324        # Connexion config
  325        connexion_config = self.get_connexion_config()
  326
  327        # Connexion format
  328        connexion_format = self.get_config().get("connexion_format", "duckdb")
  329        # Set connexion format
  330        self.connexion_format = connexion_format
  331
  332        # Connexion
  333        if not conn:
  334            if connexion_format in ["duckdb"]:
  335                conn = duckdb.connect(connexion_db, config=connexion_config)
  336                # duckDB settings
  337                duckdb_settings = self.get_duckdb_settings()
  338                if duckdb_settings:
  339                    for setting in duckdb_settings:
  340                        setting_value = duckdb_settings.get(setting)
  341                        if isinstance(setting_value, str):
  342                            setting_value = f"'{setting_value}'"
  343                        conn.execute(f"PRAGMA {setting}={setting_value};")
  344            elif connexion_format in ["sqlite"]:
  345                conn = sqlite3.connect(connexion_db)
  346
  347        # Set connexion
  348        self.conn = conn
  349
  350        # Log
  351        log.debug(f"connexion_format: {connexion_format}")
  352        log.debug(f"connexion_db: {connexion_db}")
  353        log.debug(f"connexion config: {connexion_config}")
  354        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
  355
  356    def set_output(self, output: str = None) -> None:
  357        """
  358        The `set_output` function in Python sets the output file based on the input or a specified key
  359        in the config file, extracting the output name, extension, and format.
  360
  361        :param output: The `output` parameter in the `set_output` method is used to specify the name of
  362        the output file. If the config file has an 'output' key, the method sets the output to the value
  363        of that key. If no output is provided, it sets the output to `None`
  364        :type output: str
  365        """
  366
  367        if output and not isinstance(output, str):
  368            self.output = output.name
  369        else:
  370            self.output = output
  371
  372        # Output format
  373        if self.output:
  374            output_name, output_extension = os.path.splitext(self.output)
  375            self.output_name = output_name
  376            self.output_extension = output_extension
  377            self.output_format = self.output_extension.replace(".", "")
  378        else:
  379            self.output_name = None
  380            self.output_extension = None
  381            self.output_format = None
  382
  383    def set_header(self) -> None:
  384        """
  385        It reads the header of a VCF file and stores it as a list of strings and as a VCF object
  386        """
  387
  388        input_file = self.get_input()
  389        default_header_list = [
  390            "##fileformat=VCFv4.2",
  391            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
  392        ]
  393
  394        # Full path
  395        input_file = full_path(input_file)
  396
  397        if input_file:
  398
  399            input_format = self.get_input_format()
  400            input_compressed = self.get_input_compressed()
  401            config = self.get_config()
  402            header_list = default_header_list
  403            if input_format in [
  404                "vcf",
  405                "hdr",
  406                "tsv",
  407                "csv",
  408                "psv",
  409                "parquet",
  410                "db",
  411                "duckdb",
  412            ]:
  413                # header provided in param
  414                if config.get("header_file", None):
  415                    with open(config.get("header_file"), "rt") as f:
  416                        header_list = self.read_vcf_header(f)
  417                # within a vcf file format (header within input file itsself)
  418                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
  419                    # within a compressed vcf file format (.vcf.gz)
  420                    if input_compressed:
  421                        with bgzf.open(input_file, "rt") as f:
  422                            header_list = self.read_vcf_header(f)
  423                    # within an uncompressed vcf file format (.vcf)
  424                    else:
  425                        with open(input_file, "rt") as f:
  426                            header_list = self.read_vcf_header(f)
  427                # header provided in default external file .hdr
  428                elif os.path.exists((input_file + ".hdr")):
  429                    with open(input_file + ".hdr", "rt") as f:
  430                        header_list = self.read_vcf_header(f)
  431                else:
  432                    try:  # Try to get header info fields and file columns
  433
  434                        with tempfile.TemporaryDirectory() as tmpdir:
  435
  436                            # Create database
  437                            db_for_header = Database(database=input_file)
  438
  439                            # Get header columns for infos fields
  440                            db_header_from_columns = (
  441                                db_for_header.get_header_from_columns()
  442                            )
  443
  444                            # Get real columns in the file
  445                            db_header_columns = db_for_header.get_columns()
  446
  447                            # Write header file
  448                            header_file_tmp = os.path.join(tmpdir, "header")
  449                            f = open(header_file_tmp, "w")
  450                            vcf.Writer(f, db_header_from_columns)
  451                            f.close()
  452
  453                            # Replace #CHROM line with rel columns
  454                            header_list = db_for_header.read_header_file(
  455                                header_file=header_file_tmp
  456                            )
  457                            header_list[-1] = "\t".join(db_header_columns)
  458
  459                    except:
  460
  461                        log.warning(
  462                            f"No header for file {input_file}. Set as default VCF header"
  463                        )
  464                        header_list = default_header_list
  465
  466            else:  # try for unknown format ?
  467
  468                log.error(f"Input file format '{input_format}' not available")
  469                raise ValueError(f"Input file format '{input_format}' not available")
  470
  471            if not header_list:
  472                header_list = default_header_list
  473
  474            # header as list
  475            self.header_list = header_list
  476
  477            # header as VCF object
  478            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))
  479
  480        else:
  481
  482            self.header_list = None
  483            self.header_vcf = None
  484
  485    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
  486        """
  487        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
  488        DataFrame based on the connection format.
  489
  490        :param query: The `query` parameter in the `get_query_to_df` function is a string that
  491        represents the SQL query you want to execute. This query will be used to fetch data from a
  492        database and convert it into a pandas DataFrame
  493        :type query: str
  494        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
  495        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
  496        function will only fetch up to that number of rows from the database query result. If no limit
  497        is specified,
  498        :type limit: int
  499        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
  500        """
  501
  502        # Connexion format
  503        connexion_format = self.get_connexion_format()
  504
  505        # Limit in query
  506        if limit:
  507            pd.set_option("display.max_rows", limit)
  508            if connexion_format in ["duckdb"]:
  509                df = (
  510                    self.conn.execute(query)
  511                    .fetch_record_batch(limit)
  512                    .read_next_batch()
  513                    .to_pandas()
  514                )
  515            elif connexion_format in ["sqlite"]:
  516                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
  517
  518        # Full query
  519        else:
  520            if connexion_format in ["duckdb"]:
  521                df = self.conn.execute(query).df()
  522            elif connexion_format in ["sqlite"]:
  523                df = pd.read_sql_query(query, self.conn)
  524
  525        return df
  526
  527    def get_overview(self) -> None:
  528        """
  529        The function prints the input, output, config, and dataframe of the current object
  530        """
  531        table_variants_from = self.get_table_variants(clause="from")
  532        sql_columns = self.get_header_columns_as_sql()
  533        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
  534        df = self.get_query_to_df(sql_query_export)
  535        log.info(
  536            "Input:  "
  537            + str(self.get_input())
  538            + " ["
  539            + str(str(self.get_input_format()))
  540            + "]"
  541        )
  542        log.info(
  543            "Output: "
  544            + str(self.get_output())
  545            + " ["
  546            + str(str(self.get_output_format()))
  547            + "]"
  548        )
  549        log.info("Config: ")
  550        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
  551            "\n"
  552        ):
  553            log.info("\t" + str(d))
  554        log.info("Param: ")
  555        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
  556            "\n"
  557        ):
  558            log.info("\t" + str(d))
  559        log.info("Sample list: " + str(self.get_header_sample_list()))
  560        log.info("Dataframe: ")
  561        for d in str(df).split("\n"):
  562            log.info("\t" + str(d))
  563
  564        # garbage collector
  565        del df
  566        gc.collect()
  567
  568        return None
  569
    def get_stats(self) -> dict:
        """
        Calculate and return various statistics of the current object as a dict.

        Top-level keys produced:
        - "Infos": input file, number of variants, number of samples (when
          detectable), number of INFO/FORMAT fields
        - "Variants": counts by chromosome, SNV/MNV/InDel counts, substitutions
        - "Samples": per-sample genotype counts (only when GT/FORMAT present)
        - "Header": description of INFO and FORMAT fields
        - "Quality": QUAL statistics (only when a QUAL column is present)

        :return: a dictionary containing various statistics of the current object
        """

        # Log
        log.info(f"Stats Calculation...")

        # table variants
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header: INFO and FORMAT field definitions from the VCF header
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chr
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Calculate percentage (fraction of total, per chromosome)
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Check Samples: only meaningful when GT genotypes and a FORMAT column exist
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                # Extract the leading genotype (e.g. "0/1") from each sample
                # value and count rows per genotype; rows are kept only when
                # the sample value has as many ':'-separated fields as FORMAT
                sql_query_samples = f"""
                    SELECT  '{sample}' as sample,
                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                      )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                # A sample counts as present when at least one genotype matched
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

            stats["Samples"] = samples
            stats["Infos"]["Number of samples"] = nb_of_samples

        # #
        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
        #     stats["Infos"]["Number of samples"] = nb_of_samples
        # elif nb_of_samples:
        #     stats["Infos"]["Number of samples"] = "not a VCF format"

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        # `i` numbers the fields continuously across both sections
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # num: map PyVCF special codes to VCF Number letters
                # (None -> ".", -1 -> "A", -2 -> "G", -3 -> "R")
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # type
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # desc
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL
        if "QUAL" in self.get_header_columns():
            # Missing QUAL values ('.') are excluded from the statistics
            sql_query_qual = f"""
                    SELECT
                        avg(CAST(QUAL AS INTEGER)) AS Average,
                        min(CAST(QUAL AS INTEGER)) AS Minimum,
                        max(CAST(QUAL AS INTEGER)) AS Maximum,
                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                        median(CAST(QUAL AS INTEGER)) AS Median,
                        variance(CAST(QUAL AS INTEGER)) AS Variance
                    FROM {table_variants_from}
                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                    """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel
        # NOTE(review): in the InDel clause below, SQL precedence makes the
        # filter `len(REF) > 1 OR (len(ALT) > 1 AND len(REF) != len(ALT))` —
        # confirm this is the intended definition of InDel

        sql_query_snv = f"""
            
            SELECT Type, count FROM (

                    SELECT
                        'Total' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}

                    UNION

                    SELECT
                        'MNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 AND len(ALT) > 1
                    AND len(REF) = len(ALT)

                    UNION

                    SELECT
                        'InDel' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 OR len(ALT) > 1
                    AND len(REF) != len(ALT)
                    
                    UNION

                    SELECT
                        'SNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) = 1 AND len(ALT) = 1

                )

            ORDER BY count DESC

                """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        # Single-base substitutions (e.g. "A>G"), most frequent first
        sql_query_snv_substitution = f"""
                SELECT
                    concat(REF, '>', ALT) AS 'Substitution',
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1
                GROUP BY REF, ALT
                ORDER BY count(*) DESC
                """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats
  791
  792    def stats_to_file(self, file: str = None) -> str:
  793        """
  794        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
  795        into a JSON object, and writes the JSON object to the specified file.
  796
  797        :param file: The `file` parameter is a string that represents the file path where the JSON data
  798        will be written
  799        :type file: str
  800        :return: the name of the file that was written to.
  801        """
  802
  803        # Get stats
  804        stats = self.get_stats()
  805
  806        # Serializing json
  807        json_object = json.dumps(stats, indent=4)
  808
  809        # Writing to sample.json
  810        with open(file, "w") as outfile:
  811            outfile.write(json_object)
  812
  813        return file
  814
  815    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
  816        """
  817        The `print_stats` function generates a markdown file and prints the statistics contained in a
  818        JSON file in a formatted manner.
  819
  820        :param output_file: The `output_file` parameter is a string that specifies the path and filename
  821        of the output file where the stats will be printed in Markdown format. If no `output_file` is
  822        provided, a temporary directory will be created and the stats will be saved in a file named
  823        "stats.md" within that
  824        :type output_file: str
  825        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
  826        file where the statistics will be saved. If no value is provided, a temporary directory will be
  827        created and a default file name "stats.json" will be used
  828        :type json_file: str
  829        :return: The function `print_stats` does not return any value. It has a return type annotation
  830        of `None`.
  831        """
  832
  833        # Full path
  834        output_file = full_path(output_file)
  835        json_file = full_path(json_file)
  836
  837        with tempfile.TemporaryDirectory() as tmpdir:
  838
  839            # Files
  840            if not output_file:
  841                output_file = os.path.join(tmpdir, "stats.md")
  842            if not json_file:
  843                json_file = os.path.join(tmpdir, "stats.json")
  844
  845            # Create folders
  846            if not os.path.exists(os.path.dirname(output_file)):
  847                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
  848            if not os.path.exists(os.path.dirname(json_file)):
  849                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
  850
  851            # Create stats JSON file
  852            stats_file = self.stats_to_file(file=json_file)
  853
  854            # Print stats file
  855            with open(stats_file) as f:
  856                stats = yaml.safe_load(f)
  857
  858            # Output
  859            output_title = []
  860            output_index = []
  861            output = []
  862
  863            # Title
  864            output_title.append("# HOWARD Stats")
  865
  866            # Index
  867            output_index.append("## Index")
  868
  869            # Process sections
  870            for section in stats:
  871                infos = stats.get(section)
  872                section_link = "#" + section.lower().replace(" ", "-")
  873                output.append(f"## {section}")
  874                output_index.append(f"- [{section}]({section_link})")
  875
  876                if len(infos):
  877                    for info in infos:
  878                        try:
  879                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
  880                            is_df = True
  881                        except:
  882                            try:
  883                                df = pd.DataFrame.from_dict(
  884                                    json.loads((infos.get(info))), orient="index"
  885                                )
  886                                is_df = True
  887                            except:
  888                                is_df = False
  889                        if is_df:
  890                            output.append(f"### {info}")
  891                            info_link = "#" + info.lower().replace(" ", "-")
  892                            output_index.append(f"   - [{info}]({info_link})")
  893                            output.append(f"{df.to_markdown(index=False)}")
  894                        else:
  895                            output.append(f"- {info}: {infos.get(info)}")
  896                else:
  897                    output.append(f"NA")
  898
  899            # Write stats in markdown file
  900            with open(output_file, "w") as fp:
  901                for item in output_title:
  902                    fp.write("%s\n" % item)
  903                for item in output_index:
  904                    fp.write("%s\n" % item)
  905                for item in output:
  906                    fp.write("%s\n" % item)
  907
  908            # Output stats in markdown
  909            print("")
  910            print("\n\n".join(output_title))
  911            print("")
  912            print("\n\n".join(output))
  913            print("")
  914
  915        return None
  916
  917    def get_input(self) -> str:
  918        """
  919        It returns the value of the input variable.
  920        :return: The input is being returned.
  921        """
  922        return self.input
  923
  924    def get_input_format(self, input_file: str = None) -> str:
  925        """
  926        This function returns the format of the input variable, either from the provided input file or
  927        by prompting for input.
  928
  929        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
  930        represents the file path of the input file. If no `input_file` is provided when calling the
  931        method, it will default to `None`
  932        :type input_file: str
  933        :return: The format of the input variable is being returned.
  934        """
  935
  936        if not input_file:
  937            input_file = self.get_input()
  938        input_format = get_file_format(input_file)
  939        return input_format
  940
  941    def get_input_compressed(self, input_file: str = None) -> str:
  942        """
  943        The function `get_input_compressed` returns the format of the input variable after compressing
  944        it.
  945
  946        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
  947        that represents the file path of the input file. If no `input_file` is provided when calling the
  948        method, it will default to `None` and the method will then call `self.get_input()` to
  949        :type input_file: str
  950        :return: The function `get_input_compressed` returns the compressed format of the input
  951        variable.
  952        """
  953
  954        if not input_file:
  955            input_file = self.get_input()
  956        input_compressed = get_file_compressed(input_file)
  957        return input_compressed
  958
  959    def get_output(self) -> str:
  960        """
  961        It returns the output of the neuron.
  962        :return: The output of the neural network.
  963        """
  964
  965        return self.output
  966
  967    def get_output_format(self, output_file: str = None) -> str:
  968        """
  969        The function `get_output_format` returns the format of the input variable or the output file if
  970        provided.
  971
  972        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
  973        that represents the file path of the output file. If no `output_file` is provided when calling
  974        the method, it will default to the output obtained from the `get_output` method of the class
  975        instance. The
  976        :type output_file: str
  977        :return: The format of the input variable is being returned.
  978        """
  979
  980        if not output_file:
  981            output_file = self.get_output()
  982        output_format = get_file_format(output_file)
  983
  984        return output_format
  985
  986    def get_config(self) -> dict:
  987        """
  988        It returns the config
  989        :return: The config variable is being returned.
  990        """
  991        return self.config
  992
  993    def get_param(self) -> dict:
  994        """
  995        It returns the param
  996        :return: The param variable is being returned.
  997        """
  998        return self.param
  999
 1000    def get_connexion_db(self) -> str:
 1001        """
 1002        It returns the connexion_db attribute of the object
 1003        :return: The connexion_db is being returned.
 1004        """
 1005        return self.connexion_db
 1006
 1007    def get_prefix(self) -> str:
 1008        """
 1009        It returns the prefix of the object.
 1010        :return: The prefix is being returned.
 1011        """
 1012        return self.prefix
 1013
 1014    def get_table_variants(self, clause: str = "select") -> str:
 1015        """
 1016        This function returns the table_variants attribute of the object
 1017
 1018        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
 1019        defaults to select (optional)
 1020        :return: The table_variants attribute of the object.
 1021        """
 1022
 1023        # Access
 1024        access = self.get_config().get("access", None)
 1025
 1026        # Clauses "select", "where", "update"
 1027        if clause in ["select", "where", "update"]:
 1028            table_variants = self.table_variants
 1029        # Clause "from"
 1030        elif clause in ["from"]:
 1031            # For Read Only
 1032            if self.get_input_format() in ["parquet"] and access in ["RO"]:
 1033                input_file = self.get_input()
 1034                table_variants = f"'{input_file}' as variants"
 1035            # For Read Write
 1036            else:
 1037                table_variants = f"{self.table_variants} as variants"
 1038        else:
 1039            table_variants = self.table_variants
 1040        return table_variants
 1041
 1042    def get_tmp_dir(self) -> str:
 1043        """
 1044        The function `get_tmp_dir` returns the temporary directory path based on configuration
 1045        parameters or a default path.
 1046        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
 1047        configuration, parameters, and a default value of "/tmp".
 1048        """
 1049
 1050        return get_tmp(
 1051            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
 1052        )
 1053
 1054    def get_connexion_type(self) -> str:
 1055        """
 1056        If the connexion type is not in the list of allowed connexion types, raise a ValueError
 1057
 1058        :return: The connexion type is being returned.
 1059        """
 1060        return self.get_config().get("connexion_type", "memory")
 1061
 1062    def get_connexion(self):
 1063        """
 1064        It returns the connection object
 1065
 1066        :return: The connection object.
 1067        """
 1068        return self.conn
 1069
 1070    def close_connexion(self) -> None:
 1071        """
 1072        This function closes the connection to the database.
 1073        :return: The connection is being closed.
 1074        """
 1075        return self.conn.close()
 1076
 1077    def get_header(self, type: str = "vcf"):
 1078        """
 1079        This function returns the header of the VCF file as a list of strings
 1080
 1081        :param type: the type of header you want to get, defaults to vcf (optional)
 1082        :return: The header of the vcf file.
 1083        """
 1084
 1085        if self.header_vcf:
 1086            if type == "vcf":
 1087                return self.header_vcf
 1088            elif type == "list":
 1089                return self.header_list
 1090        else:
 1091            if type == "vcf":
 1092                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
 1093                return header
 1094            elif type == "list":
 1095                return vcf_required
 1096
 1097    def get_header_infos_list(self) -> list:
 1098        """
 1099        This function retrieves a list of information fields from the header.
 1100        :return: A list of information fields from the header.
 1101        """
 1102
 1103        # Init
 1104        infos_list = []
 1105
 1106        for field in self.get_header().infos:
 1107            infos_list.append(field)
 1108
 1109        return infos_list
 1110
 1111    def get_header_length(self, file: str = None) -> int:
 1112        """
 1113        The function `get_header_length` returns the length of the header list, excluding the #CHROM
 1114        line.
 1115
 1116        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
 1117        header file. If this argument is provided, the function will read the header from the specified
 1118        file and return the length of the header list minus 1 (to exclude the #CHROM line)
 1119        :type file: str
 1120        :return: the length of the header list, excluding the #CHROM line.
 1121        """
 1122
 1123        if file:
 1124            return len(self.read_vcf_header_file(file=file)) - 1
 1125        elif self.get_header(type="list"):
 1126            return len(self.get_header(type="list")) - 1
 1127        else:
 1128            return 0
 1129
 1130    def get_header_columns(self) -> str:
 1131        """
 1132        This function returns the header list of a VCF
 1133
 1134        :return: The length of the header list.
 1135        """
 1136        if self.get_header():
 1137            return self.get_header(type="list")[-1]
 1138        else:
 1139            return ""
 1140
 1141    def get_header_columns_as_list(self) -> list:
 1142        """
 1143        This function returns the header list of a VCF
 1144
 1145        :return: The length of the header list.
 1146        """
 1147        if self.get_header():
 1148            return self.get_header_columns().strip().split("\t")
 1149        else:
 1150            return []
 1151
 1152    def get_header_columns_as_sql(self) -> str:
 1153        """
 1154        This function retruns header length (without #CHROM line)
 1155
 1156        :return: The length of the header list.
 1157        """
 1158        sql_column_list = []
 1159        for col in self.get_header_columns_as_list():
 1160            sql_column_list.append(f'"{col}"')
 1161        return ",".join(sql_column_list)
 1162
 1163    def get_header_sample_list(
 1164        self, check: bool = False, samples: list = None, samples_force: bool = False
 1165    ) -> list:
 1166        """
 1167        The function `get_header_sample_list` returns a list of samples from a VCF header, with optional
 1168        checking and filtering based on input parameters.
 1169
 1170        :param check: The `check` parameter in the `get_header_sample_list` function is a boolean
 1171        parameter that determines whether to check if the samples in the list are properly defined as
 1172        genotype columns. If `check` is set to `True`, the function will verify if each sample in the
 1173        list is defined as a, defaults to False
 1174        :type check: bool (optional)
 1175        :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that
 1176        allows you to specify a subset of samples from the header. If you provide a list of sample
 1177        names, the function will check if each sample is defined in the header. If a sample is not found
 1178        in the
 1179        :type samples: list
 1180        :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is
 1181        a boolean parameter that determines whether to force the function to return the sample list
 1182        without checking if the samples are genotype columns. If `samples_force` is set to `True`, the
 1183        function will return the sample list without performing, defaults to False
 1184        :type samples_force: bool (optional)
 1185        :return: The function `get_header_sample_list` returns a list of samples based on the input
 1186        parameters and conditions specified in the function.
 1187        """
 1188
 1189        # Init
 1190        samples_list = []
 1191
 1192        if samples is None:
 1193            samples_list = self.header_vcf.samples
 1194        else:
 1195            samples_checked = []
 1196            for sample in samples:
 1197                if sample in self.header_vcf.samples:
 1198                    samples_checked.append(sample)
 1199                else:
 1200                    log.warning(f"Sample '{sample}' not defined in header")
 1201            samples_list = samples_checked
 1202
 1203            # Force sample list without checking if is_genotype_column
 1204            if samples_force:
 1205                log.warning(f"Samples {samples_list} not checked if genotypes")
 1206                return samples_list
 1207
 1208        if check:
 1209            samples_checked = []
 1210            for sample in samples_list:
 1211                if self.is_genotype_column(column=sample):
 1212                    samples_checked.append(sample)
 1213                else:
 1214                    log.warning(
 1215                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
 1216                    )
 1217            samples_list = samples_checked
 1218
 1219        # Return samples list
 1220        return samples_list
 1221
 1222    def is_genotype_column(self, column: str = None) -> bool:
 1223        """
 1224        This function checks if a given column is a genotype column in a database.
 1225
 1226        :param column: The `column` parameter in the `is_genotype_column` method is a string that
 1227        represents the column name in a database table. This method checks if the specified column is a
 1228        genotype column in the database. If a column name is provided, it calls the `is_genotype_column`
 1229        method of
 1230        :type column: str
 1231        :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter
 1232        is not None, it calls the `is_genotype_column` method of the `Database` class with the specified
 1233        column name and returns the result. If the `column` parameter is None, it returns False.
 1234        """
 1235
 1236        if column is not None:
 1237            return Database(database=self.get_input()).is_genotype_column(column=column)
 1238        else:
 1239            return False
 1240
 1241    def get_verbose(self) -> bool:
 1242        """
 1243        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
 1244        exist
 1245
 1246        :return: The value of the key "verbose" in the config dictionary.
 1247        """
 1248        return self.get_config().get("verbose", False)
 1249
 1250    def get_connexion_format(self) -> str:
 1251        """
 1252        It returns the connexion format of the object.
 1253        :return: The connexion_format is being returned.
 1254        """
 1255        connexion_format = self.connexion_format
 1256        if connexion_format not in ["duckdb", "sqlite"]:
 1257            log.error(f"Unknown connexion format {connexion_format}")
 1258            raise ValueError(f"Unknown connexion format {connexion_format}")
 1259        else:
 1260            return connexion_format
 1261
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks with pandas and append each chunk into
        the 'variants' table of the current connexion (DuckDB or SQLite).

        :param file: path or file-like object of the delimited file to load
        :param columns: comma-separated, quoted column names used in the
        DuckDB INSERT statement (e.g. '"#CHROM", "POS"')
        :type columns: str
        :param header_len: number of leading lines to skip before the data
        (e.g. VCF header lines), defaults to 0
        :type header_len: int (optional)
        :param sep: field delimiter of the input file, defaults to \t
        :type sep: str (optional)
        :param chunksize: number of rows per pandas chunk; may be overridden
        by config["load"]["chunk"], defaults to 1000000
        :type chunksize: int (optional)
        """

        # Allow the chunk size to be overridden from the configuration
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE(review): when chunksize is falsy (0/None via config) the file
        # is silently not loaded at all — confirm this is intended
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # DuckDB resolves "FROM chunk" through its replacement
                    # scan of local Python variables: the DataFrame local
                    # named 'chunk' is queried directly — do not rename it
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # SQLite path: pandas creates/appends the table itself
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
 1315
 1316    def load_data(
 1317        self,
 1318        input_file: str = None,
 1319        drop_variants_table: bool = False,
 1320        sample_size: int = 20480,
 1321    ) -> None:
 1322        """
 1323        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
 1324        table before loading the data and specify a sample size.
 1325
 1326        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
 1327        table
 1328        :type input_file: str
 1329        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
 1330        determines whether the variants table should be dropped before loading the data. If set to
 1331        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
 1332        not be dropped, defaults to False
 1333        :type drop_variants_table: bool (optional)
 1334        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
 1335        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
 1336        20480
 1337        :type sample_size: int (optional)
 1338        """
 1339
 1340        log.info("Loading...")
 1341
 1342        # change input file
 1343        if input_file:
 1344            self.set_input(input_file)
 1345            self.set_header()
 1346
 1347        # drop variants table
 1348        if drop_variants_table:
 1349            self.drop_variants_table()
 1350
 1351        # get table variants
 1352        table_variants = self.get_table_variants()
 1353
 1354        # Access
 1355        access = self.get_config().get("access", None)
 1356        log.debug(f"access: {access}")
 1357
 1358        # Input format and compress
 1359        input_format = self.get_input_format()
 1360        input_compressed = self.get_input_compressed()
 1361        log.debug(f"input_format: {input_format}")
 1362        log.debug(f"input_compressed: {input_compressed}")
 1363
 1364        # input_compressed_format
 1365        if input_compressed:
 1366            input_compressed_format = "gzip"
 1367        else:
 1368            input_compressed_format = "none"
 1369        log.debug(f"input_compressed_format: {input_compressed_format}")
 1370
 1371        # Connexion format
 1372        connexion_format = self.get_connexion_format()
 1373
 1374        # Sample size
 1375        if not sample_size:
 1376            sample_size = -1
 1377        log.debug(f"sample_size: {sample_size}")
 1378
 1379        # Load data
 1380        log.debug(f"Load Data from {input_format}")
 1381
 1382        # DuckDB connexion
 1383        if connexion_format in ["duckdb"]:
 1384
 1385            # Database already exists
 1386            if self.input_format in ["db", "duckdb"]:
 1387
 1388                if connexion_format in ["duckdb"]:
 1389                    log.debug(f"Input file format '{self.input_format}' duckDB")
 1390                else:
 1391                    log.error(
 1392                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1393                    )
 1394                    raise ValueError(
 1395                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1396                    )
 1397
 1398            # Load from existing database format
 1399            else:
 1400
 1401                try:
 1402                    # Create Table or View
 1403                    database = Database(database=self.input)
 1404                    sql_from = database.get_sql_from(sample_size=sample_size)
 1405
 1406                    if access in ["RO"]:
 1407                        sql_load = (
 1408                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
 1409                        )
 1410                    else:
 1411                        sql_load = (
 1412                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
 1413                        )
 1414                    self.conn.execute(sql_load)
 1415
 1416                except:
 1417                    # Format not available
 1418                    log.error(f"Input file format '{self.input_format}' not available")
 1419                    raise ValueError(
 1420                        f"Input file format '{self.input_format}' not available"
 1421                    )
 1422
 1423        # SQLite connexion
 1424        elif connexion_format in ["sqlite"] and input_format in [
 1425            "vcf",
 1426            "tsv",
 1427            "csv",
 1428            "psv",
 1429        ]:
 1430
 1431            # Main structure
 1432            structure = {
 1433                "#CHROM": "VARCHAR",
 1434                "POS": "INTEGER",
 1435                "ID": "VARCHAR",
 1436                "REF": "VARCHAR",
 1437                "ALT": "VARCHAR",
 1438                "QUAL": "VARCHAR",
 1439                "FILTER": "VARCHAR",
 1440                "INFO": "VARCHAR",
 1441            }
 1442
 1443            # Strcuture with samples
 1444            structure_complete = structure
 1445            if self.get_header_sample_list():
 1446                structure["FORMAT"] = "VARCHAR"
 1447                for sample in self.get_header_sample_list():
 1448                    structure_complete[sample] = "VARCHAR"
 1449
 1450            # Columns list for create and insert
 1451            sql_create_table_columns = []
 1452            sql_create_table_columns_list = []
 1453            for column in structure_complete:
 1454                column_type = structure_complete[column]
 1455                sql_create_table_columns.append(
 1456                    f'"{column}" {column_type} default NULL'
 1457                )
 1458                sql_create_table_columns_list.append(f'"{column}"')
 1459
 1460            # Create database
 1461            log.debug(f"Create Table {table_variants}")
 1462            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
 1463            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
 1464            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
 1465            self.conn.execute(sql_create_table)
 1466
 1467            # chunksize define length of file chunk load file
 1468            chunksize = 100000
 1469
 1470            # delimiter
 1471            delimiter = file_format_delimiters.get(input_format, "\t")
 1472
 1473            # Load the input file
 1474            with open(self.input, "rt") as input_file:
 1475
 1476                # Use the appropriate file handler based on the input format
 1477                if input_compressed:
 1478                    input_file = bgzf.open(self.input, "rt")
 1479                if input_format in ["vcf"]:
 1480                    header_len = self.get_header_length()
 1481                else:
 1482                    header_len = 0
 1483
 1484                # Insert the file contents into a table
 1485                self.insert_file_to_table(
 1486                    input_file,
 1487                    columns=sql_create_table_columns_list_sql,
 1488                    header_len=header_len,
 1489                    sep=delimiter,
 1490                    chunksize=chunksize,
 1491                )
 1492
 1493        else:
 1494            log.error(
 1495                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1496            )
 1497            raise ValueError(
 1498                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1499            )
 1500
 1501        # Explode INFOS fields into table fields
 1502        if self.get_explode_infos():
 1503            self.explode_infos(
 1504                prefix=self.get_explode_infos_prefix(),
 1505                fields=self.get_explode_infos_fields(),
 1506                force=True,
 1507            )
 1508
 1509        # Create index after insertion
 1510        self.create_indexes()
 1511
 1512    def get_explode_infos(self) -> bool:
 1513        """
 1514        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
 1515        to False if it is not set.
 1516        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
 1517        value. If the parameter is not present, it will return False.
 1518        """
 1519
 1520        return self.get_param().get("explode", {}).get("explode_infos", False)
 1521
 1522    def get_explode_infos_fields(
 1523        self,
 1524        explode_infos_fields: str = None,
 1525        remove_fields_not_in_header: bool = False,
 1526    ) -> list:
 1527        """
 1528        The `get_explode_infos_fields` function returns a list of exploded information fields based on
 1529        the input parameter `explode_infos_fields`.
 1530
 1531        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
 1532        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
 1533        comma-separated list of field names to explode
 1534        :type explode_infos_fields: str
 1535        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
 1536        flag that determines whether to remove fields that are not present in the header. If it is set
 1537        to `True`, any field that is not in the header will be excluded from the list of exploded
 1538        information fields. If it is set to `, defaults to False
 1539        :type remove_fields_not_in_header: bool (optional)
 1540        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
 1541        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
 1542        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
 1543        Otherwise, it returns a list of exploded information fields after removing any spaces and
 1544        splitting the string by commas.
 1545        """
 1546
 1547        # If no fields, get it in param
 1548        if not explode_infos_fields:
 1549            explode_infos_fields = (
 1550                self.get_param().get("explode", {}).get("explode_infos_fields", None)
 1551            )
 1552
 1553        # If no fields, defined as all fields in header using keyword
 1554        if not explode_infos_fields:
 1555            explode_infos_fields = "*"
 1556
 1557        # If fields list not empty
 1558        if explode_infos_fields:
 1559
 1560            # Input fields list
 1561            if isinstance(explode_infos_fields, str):
 1562                fields_input = explode_infos_fields.split(",")
 1563            elif isinstance(explode_infos_fields, list):
 1564                fields_input = explode_infos_fields
 1565            else:
 1566                fields_input = []
 1567
 1568            # Fields list without * keyword
 1569            fields_without_all = fields_input.copy()
 1570            if "*".casefold() in (item.casefold() for item in fields_without_all):
 1571                fields_without_all.remove("*")
 1572
 1573            # Fields in header
 1574            fields_in_header = sorted(list(set(self.get_header().infos)))
 1575
 1576            # Construct list of fields
 1577            fields_output = []
 1578            for field in fields_input:
 1579
 1580                # Strip field
 1581                field = field.strip()
 1582
 1583                # format keyword * in regex
 1584                if field.upper() in ["*"]:
 1585                    field = ".*"
 1586
 1587                # Find all fields with pattern
 1588                r = re.compile(rf"^{field}$")
 1589                fields_search = sorted(list(filter(r.match, fields_in_header)))
 1590
 1591                # Remove fields input from search
 1592                if field in fields_search:
 1593                    fields_search = [field]
 1594                elif fields_search != [field]:
 1595                    fields_search = sorted(
 1596                        list(set(fields_search).difference(fields_input))
 1597                    )
 1598
 1599                # If field is not in header (avoid not well formatted header)
 1600                if not fields_search and not remove_fields_not_in_header:
 1601                    fields_search = [field]
 1602
 1603                # Add found fields
 1604                for new_field in fields_search:
 1605                    # Add field, if not already exists, and if it is in header (if asked)
 1606                    if (
 1607                        new_field not in fields_output
 1608                        and (
 1609                            not remove_fields_not_in_header
 1610                            or new_field in fields_in_header
 1611                        )
 1612                        and new_field not in [".*"]
 1613                    ):
 1614                        fields_output.append(new_field)
 1615
 1616            return fields_output
 1617
 1618        else:
 1619
 1620            return []
 1621
 1622    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
 1623        """
 1624        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
 1625        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
 1626        not provided.
 1627
 1628        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
 1629        prefix to be used for exploding or expanding information
 1630        :type explode_infos_prefix: str
 1631        :return: the value of the variable `explode_infos_prefix`.
 1632        """
 1633
 1634        if not explode_infos_prefix:
 1635            explode_infos_prefix = (
 1636                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
 1637            )
 1638
 1639        return explode_infos_prefix
 1640
 1641    def add_column(
 1642        self,
 1643        table_name,
 1644        column_name,
 1645        column_type,
 1646        default_value=None,
 1647        drop: bool = False,
 1648    ) -> dict:
 1649        """
 1650        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
 1651        doesn't already exist.
 1652
 1653        :param table_name: The name of the table to which you want to add a column
 1654        :param column_name: The parameter "column_name" is the name of the column that you want to add
 1655        to the table
 1656        :param column_type: The `column_type` parameter specifies the data type of the column that you
 1657        want to add to the table. It should be a string that represents the desired data type, such as
 1658        "INTEGER", "TEXT", "REAL", etc
 1659        :param default_value: The `default_value` parameter is an optional parameter that specifies the
 1660        default value for the newly added column. If a default value is provided, it will be assigned to
 1661        the column for any existing rows that do not have a value for that column
 1662        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
 1663        if it already exists in the table. If `drop` is set to `True`, the function will drop the
 1664        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
 1665        to False
 1666        :type drop: bool (optional)
 1667        :return: a boolean value indicating whether the column was successfully added to the table.
 1668        """
 1669
 1670        # added
 1671        added = False
 1672        dropped = False
 1673
 1674        # Check if the column already exists in the table
 1675        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1676        columns = self.get_query_to_df(query).columns.tolist()
 1677        if column_name.upper() in [c.upper() for c in columns]:
 1678            log.debug(
 1679                f"The {column_name} column already exists in the {table_name} table"
 1680            )
 1681            if drop:
 1682                self.drop_column(table_name=table_name, column_name=column_name)
 1683                dropped = True
 1684            else:
 1685                return None
 1686        else:
 1687            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1688
 1689        # Add column in table
 1690        add_column_query = (
 1691            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
 1692        )
 1693        if default_value is not None:
 1694            add_column_query += f" DEFAULT {default_value}"
 1695        self.execute_query(add_column_query)
 1696        added = not dropped
 1697        log.debug(
 1698            f"The {column_name} column was successfully added to the {table_name} table"
 1699        )
 1700
 1701        if added:
 1702            added_column = {
 1703                "table_name": table_name,
 1704                "column_name": column_name,
 1705                "column_type": column_type,
 1706                "default_value": default_value,
 1707            }
 1708        else:
 1709            added_column = None
 1710
 1711        return added_column
 1712
 1713    def drop_column(
 1714        self, column: dict = None, table_name: str = None, column_name: str = None
 1715    ) -> bool:
 1716        """
 1717        The `drop_column` function drops a specified column from a given table in a database and returns
 1718        True if the column was successfully dropped, and False if the column does not exist in the
 1719        table.
 1720
 1721        :param column: The `column` parameter is a dictionary that contains information about the column
 1722        you want to drop. It has two keys:
 1723        :type column: dict
 1724        :param table_name: The `table_name` parameter is the name of the table from which you want to
 1725        drop a column
 1726        :type table_name: str
 1727        :param column_name: The `column_name` parameter is the name of the column that you want to drop
 1728        from the table
 1729        :type column_name: str
 1730        :return: a boolean value. It returns True if the column was successfully dropped from the table,
 1731        and False if the column does not exist in the table.
 1732        """
 1733
 1734        # Find column infos
 1735        if column:
 1736            if isinstance(column, dict):
 1737                table_name = column.get("table_name", None)
 1738                column_name = column.get("column_name", None)
 1739            elif isinstance(column, str):
 1740                table_name = self.get_table_variants()
 1741                column_name = column
 1742            else:
 1743                table_name = None
 1744                column_name = None
 1745
 1746        if not table_name and not column_name:
 1747            return False
 1748
 1749        # Removed
 1750        removed = False
 1751
 1752        # Check if the column already exists in the table
 1753        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1754        columns = self.get_query_to_df(query).columns.tolist()
 1755        if column_name in columns:
 1756            log.debug(f"The {column_name} column exists in the {table_name} table")
 1757        else:
 1758            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1759            return False
 1760
 1761        # Add column in table # ALTER TABLE integers DROP k
 1762        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
 1763        self.execute_query(add_column_query)
 1764        removed = True
 1765        log.debug(
 1766            f"The {column_name} column was successfully dropped to the {table_name} table"
 1767        )
 1768
 1769        return removed
 1770
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        Explode VCF INFO fields into dedicated table columns.

        For each selected INFO field, a column named "<prefix><field>" is added to
        the variants table (or to `table` when given) and populated by parsing the
        raw INFO string directly in SQL (DuckDB REGEXP_EXTRACT or SQLite
        instr/substr). Nothing is done when config access is "RO".

        :param prefix: prefix for the exploded columns; when None/True or not a
        string, falls back to `get_explode_infos_prefix()` or "INFO/"
        :type prefix: str
        :param create_index: whether to re-create indexes after the explosion,
        defaults to False
        :type create_index: bool (optional)
        :param fields: list of INFO fields (or patterns) to explode; resolved
        through `get_explode_infos_fields`, so an empty/None value means all
        header fields
        :type fields: list
        :param force: drop and re-create a column if it already exists (and
        re-run the UPDATE for it), defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: update all exploded columns in a
        single UPDATE statement instead of one UPDATE per field, defaults to
        False
        :type proccess_all_fields_together: bool (optional)
        :param table: name of the table to add columns to; defaults to the
        variants table
        :type table: str
        :return: the list of added columns (dicts as returned by `add_column`)
        """

        # Drop indexes first: they would slow down (or block) the ALTER/UPDATE below
        self.drop_indexes()

        # connexion format (drives the SQL dialect used to parse INFO)
        connexion_format = self.get_connexion_format()

        # Access mode; "RO" disables any modification
        access = self.get_config().get("access", None)

        # Added columns (returned to the caller)
        added_columns = []

        if access not in ["RO"]:

            # prefix: normalize to a usable string, defaulting to "INFO/"
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants (explicit table parameter wins)
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos
            # NOTE(review): bare except — any failure in get_extra_infos() is
            # silently treated as "no extra infos"
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos (dict of INFO field definitions)
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            # One SET fragment per exploded field, collected for the UPDATE(s) below
            sql_info_alter_table_array = []

            # Info fields to check: header fields plus any explicitly requested ones
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields, normalize to an empty list
            if not fields:
                fields = []

            # Translate fields if patterns (regex / "*" expansion against header)
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                # Column name for the exploded field
                info_id_sql = prefix + info

                # Only explode fields known from header, prefix match, or extra infos
                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Type/cardinality from header; unknown fields fall back to String
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Map VCF type to SQL type; multi-valued fields stay VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field column (dropped and re-created when force=True)
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    # add_column returns None when the column was dropped and
                    # re-created, so force must also trigger the UPDATE
                    if added_column or force:

                        # add field to index (picked up later by create_indexes)
                        self.index_additionnal_fields.append(info_id_sql)

                        # Build the SET fragment extracting '<info>=' from INFO.
                        # Empty values ('' or '.') are normalized to NULL.
                        # NOTE(review): if connexion_format is neither "duckdb"
                        # nor "sqlite", update_info_field is unbound here (or
                        # stale from the previous iteration) — TODO confirm
                        # other formats cannot reach this point
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                                "{info_id_sql}" =
                                    CASE
                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                    END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # Process UPDATEs chromosome by chromosome to keep them smaller
                # NOTE(review): bare except — any query failure falls back to a
                # single whole-table pass (chrom=None)
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (only needed when there is more than one chromosome)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        # Single UPDATE with all SET fragments joined
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter_table_array_join}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        # One UPDATE per field
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # Re-create indexes if requested (they were dropped at the top)
        if create_index:
            self.create_indexes()

        return added_columns
 1987
 1988    def create_indexes(self) -> None:
 1989        """
 1990        Create indexes on the table after insertion
 1991        """
 1992
 1993        # Access
 1994        access = self.get_config().get("access", None)
 1995
 1996        # get table variants
 1997        table_variants = self.get_table_variants("FROM")
 1998
 1999        if self.get_indexing() and access not in ["RO"]:
 2000            # Create index
 2001            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
 2002            self.conn.execute(sql_create_table_index)
 2003            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
 2004            self.conn.execute(sql_create_table_index)
 2005            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
 2006            self.conn.execute(sql_create_table_index)
 2007            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
 2008            self.conn.execute(sql_create_table_index)
 2009            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
 2010            self.conn.execute(sql_create_table_index)
 2011            for field in self.index_additionnal_fields:
 2012                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
 2013                self.conn.execute(sql_create_table_index)
 2014
 2015    def drop_indexes(self) -> None:
 2016        """
 2017        Create indexes on the table after insertion
 2018        """
 2019
 2020        # Access
 2021        access = self.get_config().get("access", None)
 2022
 2023        # get table variants
 2024        table_variants = self.get_table_variants("FROM")
 2025
 2026        # Get database format
 2027        connexion_format = self.get_connexion_format()
 2028
 2029        if access not in ["RO"]:
 2030            if connexion_format in ["duckdb"]:
 2031                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
 2032            elif connexion_format in ["sqlite"]:
 2033                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
 2034
 2035            list_indexes = self.conn.execute(sql_list_indexes)
 2036            index_names = [row[0] for row in list_indexes.fetchall()]
 2037            for index in index_names:
 2038                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
 2039                self.conn.execute(sql_drop_table_index)
 2040
 2041    def read_vcf_header(self, f) -> list:
 2042        """
 2043        It reads the header of a VCF file and returns a list of the header lines
 2044
 2045        :param f: the file object
 2046        :return: The header lines of the VCF file.
 2047        """
 2048
 2049        header_list = []
 2050        for line in f:
 2051            header_list.append(line)
 2052            if line.startswith("#CHROM"):
 2053                break
 2054        return header_list
 2055
 2056    def read_vcf_header_file(self, file: str = None) -> list:
 2057        """
 2058        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
 2059        uncompressed files.
 2060
 2061        :param file: The `file` parameter is a string that represents the path to the VCF header file
 2062        that you want to read. It is an optional parameter, so if you don't provide a value, it will
 2063        default to `None`
 2064        :type file: str
 2065        :return: The function `read_vcf_header_file` returns a list.
 2066        """
 2067
 2068        if self.get_input_compressed(input_file=file):
 2069            with bgzf.open(file, "rt") as f:
 2070                return self.read_vcf_header(f=f)
 2071        else:
 2072            with open(file, "rt") as f:
 2073                return self.read_vcf_header(f=f)
 2074
 2075    def execute_query(self, query: str):
 2076        """
 2077        It takes a query as an argument, executes it, and returns the results
 2078
 2079        :param query: The query to be executed
 2080        :return: The result of the query is being returned.
 2081        """
 2082        if query:
 2083            return self.conn.execute(query)  # .fetchall()
 2084        else:
 2085            return None
 2086
 2087    def export_output(
 2088        self,
 2089        output_file: str | None = None,
 2090        output_header: str | None = None,
 2091        export_header: bool = True,
 2092        query: str | None = None,
 2093        parquet_partitions: list | None = None,
 2094        chunk_size: int | None = None,
 2095        threads: int | None = None,
 2096        sort: bool = False,
 2097        index: bool = False,
 2098        order_by: str | None = None,
 2099        fields_to_rename: dict | None = None,
 2100    ) -> bool:
 2101        """
 2102        The `export_output` function exports data from a VCF file to various formats, including VCF,
 2103        CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and
 2104        partitioning.
 2105
 2106        :param output_file: The `output_file` parameter is a string that specifies the name of the
 2107        output file where the exported data will be saved
 2108        :type output_file: str | None
 2109        :param output_header: The `output_header` parameter is a string that specifies the name of the
 2110        file where the header of the VCF file will be exported. If this parameter is not provided, the
 2111        header will be exported to a file with the same name as the `output_file` parameter, but with
 2112        the extension "
 2113        :type output_header: str | None
 2114        :param export_header: The `export_header` parameter is a boolean flag that determines whether
 2115        the header of a VCF file should be exported to a separate file or not. If `export_header` is
 2116        True, the header will be exported to a file. If `export_header` is False, the header will not
 2117        be, defaults to True
 2118        :type export_header: bool (optional)
 2119        :param query: The `query` parameter in the `export_output` function is an optional SQL query
 2120        that can be used to filter and select specific data from the VCF file before exporting it. If
 2121        provided, only the data that matches the query will be exported. This allows you to customize
 2122        the exported data based on
 2123        :type query: str | None
 2124        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
 2125        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
 2126        organize data in a hierarchical directory structure based on the values of one or more columns.
 2127        This can improve query performance when working with large datasets
 2128        :type parquet_partitions: list | None
 2129        :param chunk_size: The `chunk_size` parameter specifies the number of records in a batch when
 2130        exporting data in Parquet format. This parameter is used for partitioning the Parquet file into
 2131        multiple files. It helps in optimizing the export process by breaking down the data into
 2132        manageable chunks for processing and storage
 2133        :type chunk_size: int | None
 2134        :param threads: The `threads` parameter in the `export_output` function specifies the number of
 2135        threads to be used during the export process. It determines the level of parallelism and can
 2136        improve the performance of the export operation. If this parameter is not provided, the function
 2137        will use the default number of threads
 2138        :type threads: int | None
 2139        :param sort: The `sort` parameter in the `export_output` function is a boolean flag that
 2140        determines whether the output file should be sorted based on genomic coordinates of the
 2141        variants. If `sort` is set to `True`, the output file will be sorted. If `sort` is set to
 2142        `False`,, defaults to False
 2143        :type sort: bool (optional)
 2144        :param index: The `index` parameter in the `export_output` function is a boolean flag that
 2145        determines whether an index should be created on the output file. If `index` is set to `True`,
 2146        an index will be created on the output file. If `index` is set to `False`, no, defaults to False
 2147        :type index: bool (optional)
 2148        :param order_by: The `order_by` parameter in the `export_output` function is a string that
 2149        specifies the column(s) to use for sorting the output file. This parameter is only applicable
 2150        when exporting data in VCF format. It allows you to specify the column(s) based on which the
 2151        output file should be
 2152        :type order_by: str | None
 2153        :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that specifies the
 2154        mapping of field names to be renamed during the export process. This parameter allows you to
 2155        customize the output field names before exporting the data. Each key-value pair in the
 2156        dictionary represents the original field name as the key and the new field name
 2157        :type fields_to_rename: dict | None
 2158        :return: The `export_output` function returns a boolean value. It checks if the output file
 2159        exists and returns True if it does, or None if it doesn't.
 2160        """
 2161
 2162        # Log
 2163        log.info("Exporting...")
 2164
 2165        # Full path
 2166        output_file = full_path(output_file)
 2167        output_header = full_path(output_header)
 2168
 2169        # Config
 2170        config = self.get_config()
 2171
 2172        # Param
 2173        param = self.get_param()
 2174
 2175        # Tmp files to remove
 2176        tmp_to_remove = []
 2177
 2178        # If no output, get it
 2179        if not output_file:
 2180            output_file = self.get_output()
 2181
 2182        # If not threads
 2183        if not threads:
 2184            threads = self.get_threads()
 2185
 2186        # Rename fields
 2187        if not fields_to_rename:
 2188            fields_to_rename = param.get("export", {}).get("fields_to_rename", None)
 2189        self.rename_info_fields(fields_to_rename=fields_to_rename)
 2190
 2191        # Auto header name with extension
 2192        if export_header or output_header:
 2193            if not output_header:
 2194                output_header = f"{output_file}.hdr"
 2195            # Export header
 2196            self.export_header(output_file=output_file)
 2197
 2198        # Switch off export header if VCF output
 2199        output_file_type = get_file_format(output_file)
 2200        if output_file_type in ["vcf"]:
 2201            export_header = False
 2202            tmp_to_remove.append(output_header)
 2203
 2204        # Chunk size
 2205        if not chunk_size:
 2206            chunk_size = config.get("chunk_size", None)
 2207
 2208        # Parquet partition
 2209        if not parquet_partitions:
 2210            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
 2211        if parquet_partitions and isinstance(parquet_partitions, str):
 2212            parquet_partitions = parquet_partitions.split(",")
 2213
 2214        # Order by
 2215        if not order_by:
 2216            order_by = param.get("export", {}).get("order_by", "")
 2217
 2218        # Header in output
 2219        header_in_output = param.get("export", {}).get("include_header", False)
 2220
 2221        # Database
 2222        database_source = self.get_connexion()
 2223
 2224        # Connexion format
 2225        connexion_format = self.get_connexion_format()
 2226
 2227        # Explode infos
 2228        if self.get_explode_infos():
 2229            self.explode_infos(
 2230                prefix=self.get_explode_infos_prefix(),
 2231                fields=self.get_explode_infos_fields(),
 2232                force=False,
 2233            )
 2234
 2235        # if connexion_format in ["sqlite"] or query:
 2236        if connexion_format in ["sqlite"]:
 2237
 2238            # Export in Parquet
 2239            random_tmp = "".join(
 2240                random.choice(string.ascii_lowercase) for i in range(10)
 2241            )
 2242            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
 2243            tmp_to_remove.append(database_source)
 2244
 2245            # Table Variants
 2246            table_variants = self.get_table_variants()
 2247
 2248            # Create export query
 2249            sql_query_export_subquery = f"""
 2250                SELECT * FROM {table_variants}
 2251                """
 2252
 2253            # Write source file
 2254            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
 2255
 2256        # Create database
 2257        database = Database(
 2258            database=database_source,
 2259            table="variants",
 2260            header_file=output_header,
 2261            conn_config=self.get_connexion_config(),
 2262        )
 2263
 2264        # Existing colomns header
 2265        existing_columns_header = database.get_header_columns_from_database(query=query)
 2266
 2267        # Sample list
 2268        if output_file_type in ["vcf"]:
 2269            get_samples = self.get_samples()
 2270            get_samples_check = self.get_samples_check()
 2271            samples_force = get_samples is not None
 2272            sample_list = self.get_header_sample_list(
 2273                check=get_samples_check,
 2274                samples=get_samples,
 2275                samples_force=samples_force,
 2276            )
 2277        else:
 2278            sample_list = None
 2279
 2280        # Export file
 2281        database.export(
 2282            output_database=output_file,
 2283            output_header=output_header,
 2284            existing_columns_header=existing_columns_header,
 2285            parquet_partitions=parquet_partitions,
 2286            chunk_size=chunk_size,
 2287            threads=threads,
 2288            sort=sort,
 2289            index=index,
 2290            header_in_output=header_in_output,
 2291            order_by=order_by,
 2292            query=query,
 2293            export_header=export_header,
 2294            sample_list=sample_list,
 2295        )
 2296
 2297        # Remove
 2298        remove_if_exists(tmp_to_remove)
 2299
 2300        return (os.path.exists(output_file) or None) and (
 2301            os.path.exists(output_file) or None
 2302        )
 2303
 2304    def get_extra_infos(self, table: str = None) -> list:
 2305        """
 2306        The `get_extra_infos` function returns a list of columns that are in a specified table but not
 2307        in the header.
 2308
 2309        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
 2310        name of the table from which you want to retrieve the extra columns that are not present in the
 2311        header. If the `table` parameter is not provided when calling the function, it will default to
 2312        using the variants
 2313        :type table: str
 2314        :return: A list of columns that are in the specified table but not in the header of the table.
 2315        """
 2316
 2317        header_columns = []
 2318
 2319        if not table:
 2320            table = self.get_table_variants(clause="from")
 2321            header_columns = self.get_header_columns()
 2322
 2323        # Check all columns in the database
 2324        query = f""" SELECT * FROM {table} LIMIT 1 """
 2325        log.debug(f"query {query}")
 2326        table_columns = self.get_query_to_df(query).columns.tolist()
 2327        extra_columns = []
 2328
 2329        # Construct extra infos (not in header)
 2330        for column in table_columns:
 2331            if column not in header_columns:
 2332                extra_columns.append(column)
 2333
 2334        return extra_columns
 2335
 2336    def get_extra_infos_sql(self, table: str = None) -> str:
 2337        """
 2338        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
 2339        by double quotes
 2340
 2341        :param table: The name of the table to get the extra infos from. If None, the default table is
 2342        used
 2343        :type table: str
 2344        :return: A string of the extra infos
 2345        """
 2346
 2347        return ", ".join(
 2348            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
 2349        )
 2350
 2351    def export_header(
 2352        self,
 2353        header_name: str = None,
 2354        output_file: str = None,
 2355        output_file_ext: str = ".hdr",
 2356        clean_header: bool = True,
 2357        remove_chrom_line: bool = False,
 2358    ) -> str:
 2359        """
 2360        The `export_header` function takes a VCF file, extracts the header, modifies it according to
 2361        specified options, and writes it to a new file.
 2362
 2363        :param header_name: The `header_name` parameter is the name of the header file to be created. If
 2364        this parameter is not specified, the header will be written to the output file
 2365        :type header_name: str
 2366        :param output_file: The `output_file` parameter in the `export_header` function is used to
 2367        specify the name of the output file where the header will be written. If this parameter is not
 2368        provided, the header will be written to a temporary file
 2369        :type output_file: str
 2370        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
 2371        string that represents the extension of the output header file. By default, it is set to ".hdr"
 2372        if not specified by the user. This extension will be appended to the `output_file` name to
 2373        create the final, defaults to .hdr
 2374        :type output_file_ext: str (optional)
 2375        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
 2376        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
 2377        `True`, the function will clean the header by modifying certain lines based on a specific
 2378        pattern. If `clean_header`, defaults to True
 2379        :type clean_header: bool (optional)
 2380        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
 2381        boolean flag that determines whether the #CHROM line should be removed from the header before
 2382        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
 2383        defaults to False
 2384        :type remove_chrom_line: bool (optional)
 2385        :return: The function `export_header` returns the name of the temporary header file that is
 2386        created.
 2387        """
 2388
 2389        if not header_name and not output_file:
 2390            output_file = self.get_output()
 2391
 2392        if self.get_header():
 2393
 2394            # Get header object
 2395            header_obj = self.get_header()
 2396
 2397            # Create database
 2398            db_for_header = Database(database=self.get_input())
 2399
 2400            # Get real columns in the file
 2401            db_header_columns = db_for_header.get_columns()
 2402
 2403            with tempfile.TemporaryDirectory() as tmpdir:
 2404
 2405                # Write header file
 2406                header_file_tmp = os.path.join(tmpdir, "header")
 2407                f = open(header_file_tmp, "w")
 2408                vcf.Writer(f, header_obj)
 2409                f.close()
 2410
 2411                # Replace #CHROM line with rel columns
 2412                header_list = db_for_header.read_header_file(
 2413                    header_file=header_file_tmp
 2414                )
 2415                header_list[-1] = "\t".join(db_header_columns)
 2416
 2417                # Remove CHROM line
 2418                if remove_chrom_line:
 2419                    header_list.pop()
 2420
 2421                # Clean header
 2422                if clean_header:
 2423                    header_list_clean = []
 2424                    for head in header_list:
 2425                        # Clean head for malformed header
 2426                        head_clean = head
 2427                        head_clean = re.subn(
 2428                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
 2429                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
 2430                            head_clean,
 2431                            2,
 2432                        )[0]
 2433                        # Write header
 2434                        header_list_clean.append(head_clean)
 2435                    header_list = header_list_clean
 2436
 2437            tmp_header_name = output_file + output_file_ext
 2438
 2439            f = open(tmp_header_name, "w")
 2440            for line in header_list:
 2441                f.write(line)
 2442            f.close()
 2443
 2444        return tmp_header_name
 2445
 2446    def export_variant_vcf(
 2447        self,
 2448        vcf_file,
 2449        remove_info: bool = False,
 2450        add_samples: bool = True,
 2451        list_samples: list = [],
 2452        where_clause: str = "",
 2453        index: bool = False,
 2454        threads: int | None = None,
 2455    ) -> bool | None:
 2456        """
 2457        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
 2458        remove INFO field, add samples, and control compression and indexing.
 2459
 2460        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
 2461        written to. It is the output file that will contain the filtered VCF data based on the specified
 2462        parameters
 2463        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
 2464        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
 2465        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
 2466        in, defaults to False
 2467        :type remove_info: bool (optional)
 2468        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
 2469        the samples should be added to the VCF file or not. If set to True, the samples will be added.
 2470        If set to False, the samples will be removed. The default value is True, defaults to True
 2471        :type add_samples: bool (optional)
 2472        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
 2473        in the output VCF file. By default, all samples will be included. If you provide a list of
 2474        samples, only those samples will be included in the output file
 2475        :type list_samples: list
 2476        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
 2477        determines whether or not to create an index for the output VCF file. If `index` is set to
 2478        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
 2479        :type index: bool (optional)
 2480        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
 2481        number of threads to use for exporting the VCF file. It determines how many parallel threads
 2482        will be used during the export process. More threads can potentially speed up the export process
 2483        by utilizing multiple cores of the processor. If
 2484        :type threads: int | None
 2485        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
 2486        method with various parameters including the output file, query, threads, sort flag, and index
 2487        flag. The `export_output` method is responsible for exporting the VCF data based on the
 2488        specified parameters and configurations provided in the `export_variant_vcf` function.
 2489        """
 2490
 2491        # Config
 2492        config = self.get_config()
 2493
 2494        # Extract VCF
 2495        log.debug("Export VCF...")
 2496
 2497        # Table variants
 2498        table_variants = self.get_table_variants()
 2499
 2500        # Threads
 2501        if not threads:
 2502            threads = self.get_threads()
 2503
 2504        # Info fields
 2505        if remove_info:
 2506            if not isinstance(remove_info, str):
 2507                remove_info = "."
 2508            info_field = f"""'{remove_info}' as INFO"""
 2509        else:
 2510            info_field = "INFO"
 2511
 2512        # Samples fields
 2513        if add_samples:
 2514            if not list_samples:
 2515                list_samples = self.get_header_sample_list()
 2516            if list_samples:
 2517                samples_fields = " , FORMAT , " + " , ".join(
 2518                    [f""" "{sample}" """ for sample in list_samples]
 2519                )
 2520            else:
 2521                samples_fields = ""
 2522            log.debug(f"samples_fields: {samples_fields}")
 2523        else:
 2524            samples_fields = ""
 2525
 2526        # Where clause
 2527        if where_clause is None:
 2528            where_clause = ""
 2529
 2530        # Variants
 2531        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
 2532        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
 2533        log.debug(f"sql_query_select={sql_query_select}")
 2534
 2535        return self.export_output(
 2536            output_file=vcf_file,
 2537            output_header=None,
 2538            export_header=True,
 2539            query=sql_query_select,
 2540            parquet_partitions=None,
 2541            chunk_size=config.get("chunk_size", None),
 2542            threads=threads,
 2543            sort=True,
 2544            index=index,
 2545            order_by=None,
 2546        )
 2547
 2548    def run_commands(self, commands: list = [], threads: int = 1) -> None:
 2549        """
 2550        It takes a list of commands and runs them in parallel using the number of threads specified
 2551
 2552        :param commands: A list of commands to run
 2553        :param threads: The number of threads to use, defaults to 1 (optional)
 2554        """
 2555
 2556        run_parallel_commands(commands, threads)
 2557
 2558    def get_threads(self, default: int = 1) -> int:
 2559        """
 2560        This function returns the number of threads to use for a job, with a default value of 1 if not
 2561        specified.
 2562
 2563        :param default: The `default` parameter in the `get_threads` method is used to specify the
 2564        default number of threads to use if no specific value is provided. If no value is provided for
 2565        the `threads` parameter in the configuration or input parameters, the `default` value will be
 2566        used, defaults to 1
 2567        :type default: int (optional)
 2568        :return: the number of threads to use for the current job.
 2569        """
 2570
 2571        # Config
 2572        config = self.get_config()
 2573
 2574        # Param
 2575        param = self.get_param()
 2576
 2577        # Input threads
 2578        input_thread = param.get("threads", config.get("threads", None))
 2579
 2580        # Check threads
 2581        if not input_thread:
 2582            threads = default
 2583        elif int(input_thread) <= 0:
 2584            threads = os.cpu_count()
 2585        else:
 2586            threads = int(input_thread)
 2587        return threads
 2588
 2589    def get_memory(self, default: str = None) -> str:
 2590        """
 2591        This function retrieves the memory value from parameters or configuration with a default value
 2592        if not found.
 2593
 2594        :param default: The `get_memory` function takes in a default value as a string parameter. This
 2595        default value is used as a fallback in case the `memory` parameter is not provided in the
 2596        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
 2597        the function
 2598        :type default: str
 2599        :return: The `get_memory` function returns a string value representing the memory parameter. If
 2600        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
 2601        return the default value provided as an argument to the function.
 2602        """
 2603
 2604        # Config
 2605        config = self.get_config()
 2606
 2607        # Param
 2608        param = self.get_param()
 2609
 2610        # Input threads
 2611        input_memory = param.get("memory", config.get("memory", None))
 2612
 2613        # Check threads
 2614        if input_memory:
 2615            memory = input_memory
 2616        else:
 2617            memory = default
 2618
 2619        return memory
 2620
 2621    def update_from_vcf(self, vcf_file: str) -> None:
 2622        """
 2623        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
 2624
 2625        :param vcf_file: the path to the VCF file
 2626        """
 2627
 2628        connexion_format = self.get_connexion_format()
 2629
 2630        if connexion_format in ["duckdb"]:
 2631            self.update_from_vcf_duckdb(vcf_file)
 2632        elif connexion_format in ["sqlite"]:
 2633            self.update_from_vcf_sqlite(vcf_file)
 2634
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Merge the INFO column of a VCF file into the variants table (DuckDB).

        For each variant present in both the table and the VCF file, the VCF's
        INFO string is appended to the existing INFO value, separated by ';'
        when both sides are non-empty ('' and '.' are treated as empty).

        :param vcf_file: the path to the VCF file
        """

        # Variants table name
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame: skip the meta-header lines so
        # the '#CHROM' line (header=0 after skipping) provides the columns
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # NOTE: the SQL below references 'vcf_df' by name; DuckDB's replacement
        # scan resolves it to the local DataFrame above — do not rename it.
        # NOTE(review): the UPDATE has no WHERE clause, so variants absent from
        # the VCF get a NULL subquery result; presumably DuckDB's concat()
        # ignores NULL so their INFO is kept unchanged — confirm.
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)
 2690
 2691    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
 2692        """
 2693        It creates a temporary table in the SQLite database, loads the VCF file into the temporary
 2694        table, then updates the INFO column of the variants table with the INFO column of the temporary
 2695        table
 2696
 2697        :param vcf_file: The path to the VCF file you want to update the database with
 2698        """
 2699
 2700        # Create a temporary table for the VCF
 2701        table_vcf = "tmp_vcf"
 2702        sql_create = (
 2703            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
 2704        )
 2705        self.conn.execute(sql_create)
 2706
 2707        # Loading VCF into temporaire table
 2708        vcf_df = pd.read_csv(
 2709            vcf_file, sep="\t", comment="#", header=None, low_memory=False
 2710        )
 2711        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
 2712        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)
 2713
 2714        # Update table 'variants' with VCF data
 2715        # warning: CONCAT as || operator
 2716        sql_query_update = f"""
 2717            UPDATE variants as table_variants
 2718            SET INFO = CASE
 2719                            WHEN INFO NOT IN ('', '.')
 2720                            THEN INFO
 2721                            ELSE ''
 2722                        END ||
 2723                        (
 2724                        SELECT 
 2725                            CASE 
 2726                                WHEN table_variants.INFO NOT IN ('','.') 
 2727                                    AND table_vcf.INFO NOT IN ('','.')  
 2728                                THEN ';' 
 2729                                ELSE '' 
 2730                            END || 
 2731                            CASE 
 2732                                WHEN table_vcf.INFO NOT IN ('','.') 
 2733                                THEN table_vcf.INFO 
 2734                                ELSE '' 
 2735                            END
 2736                        FROM {table_vcf} as table_vcf
 2737                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
 2738                            AND table_vcf.\"POS\" = table_variants.\"POS\"
 2739                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
 2740                            AND table_vcf.\"REF\" = table_variants.\"REF\"
 2741                        )
 2742        """
 2743        self.conn.execute(sql_query_update)
 2744
 2745        # Drop temporary table
 2746        sql_drop = f"DROP TABLE {table_vcf}"
 2747        self.conn.execute(sql_drop)
 2748
 2749    def drop_variants_table(self) -> None:
 2750        """
 2751        > This function drops the variants table
 2752        """
 2753
 2754        table_variants = self.get_table_variants()
 2755        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
 2756        self.conn.execute(sql_table_variants)
 2757
    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Add a variant-identifier column to the variants table and populate it
        with a hash of the assembly, `#CHROM`, `POS`, `REF`, `ALT` and an
        SVTYPE term.

        :param variant_id_column: The name of the column to be created in the
        variants table; an empty value falls back to "variant_id", defaults
        to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the column is (re)created and repopulated even
        if it already exists
        :type force: bool
        :return: The name of the column that contains the variant_id
        """

        # Assembly: parameter takes precedence over config, then the default
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Prefix used for exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE into its own column (dropped again at the end)
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # Variants table name
        table_variants = self.get_table_variants()

        # Fall back to the default column name when an empty value is passed
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create the variant_id column if missing (or when forced)
        # NOTE(review): the existence check uses the literal "variant_id", not
        # variant_id_column — a custom column name is recreated on every call;
        # confirm this is intended.
        if "variant_id" not in self.get_extra_infos() or force:

            # Create the column with a 0 default
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Populate it with a hash of the variant's identifying fields
            # NOTE(review): '"{prefix}SVTYPE"' is wrapped in single quotes, so
            # it is a string literal, not a column reference — the SVTYPE
            # *value* may not contribute to the hash; confirm intended.
            self.conn.execute(
                f"""
                    UPDATE {table_variants}
                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Drop the temporary exploded columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Return the variant_id column name
        return variant_id_column
 2816
 2817    def get_variant_id_column(
 2818        self, variant_id_column: str = "variant_id", force: bool = None
 2819    ) -> str:
 2820        """
 2821        This function returns the variant_id column name
 2822
 2823        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
 2824        defaults to variant_id
 2825        :type variant_id_column: str (optional)
 2826        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
 2827        False, will only set the variant_id if it is not already set. If None, will set the variant_id
 2828        if it is not already set, or if it is set
 2829        :type force: bool
 2830        :return: The variant_id column name.
 2831        """
 2832
 2833        return self.set_variant_id(variant_id_column=variant_id_column, force=force)
 2834
 2835    ###
 2836    # Annotation
 2837    ###
 2838
 2839    def scan_databases(
 2840        self,
 2841        database_formats: list = ["parquet"],
 2842        database_releases: list = ["current"],
 2843    ) -> dict:
 2844        """
 2845        The function `scan_databases` scans for available databases based on specified formats and
 2846        releases.
 2847
 2848        :param database_formats: The `database_formats` parameter is a list that specifies the formats
 2849        of the databases to be scanned. In this case, the accepted format is "parquet"
 2850        :type database_formats: list ["parquet"]
 2851        :param database_releases: The `database_releases` parameter is a list that specifies the
 2852        releases of the databases to be scanned. In the provided function, the default value for
 2853        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
 2854        databases that are in the "current"
 2855        :type database_releases: list
 2856        :return: The function `scan_databases` returns a dictionary containing information about
 2857        databases that match the specified formats and releases.
 2858        """
 2859
 2860        # Config
 2861        config = self.get_config()
 2862
 2863        # Param
 2864        param = self.get_param()
 2865
 2866        # Param - Assembly
 2867        assembly = param.get("assembly", config.get("assembly", None))
 2868        if not assembly:
 2869            assembly = DEFAULT_ASSEMBLY
 2870            log.warning(f"Default assembly '{assembly}'")
 2871
 2872        # Scan for availabled databases
 2873        log.info(
 2874            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
 2875        )
 2876        databases_infos_dict = databases_infos(
 2877            database_folder_releases=database_releases,
 2878            database_formats=database_formats,
 2879            assembly=assembly,
 2880            config=config,
 2881        )
 2882        log.info(
 2883            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
 2884        )
 2885
 2886        return databases_infos_dict
 2887
 2888    def annotation(self) -> None:
 2889        """
 2890        It annotates the VCF file with the annotations specified in the config file.
 2891        """
 2892
 2893        # Config
 2894        config = self.get_config()
 2895
 2896        # Param
 2897        param = self.get_param()
 2898
 2899        # Param - Assembly
 2900        assembly = param.get("assembly", config.get("assembly", None))
 2901        if not assembly:
 2902            assembly = DEFAULT_ASSEMBLY
 2903            log.warning(f"Default assembly '{assembly}'")
 2904
 2905        # annotations databases folders
 2906        annotations_databases = set(
 2907            config.get("folders", {})
 2908            .get("databases", {})
 2909            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
 2910            + config.get("folders", {})
 2911            .get("databases", {})
 2912            .get("parquet", ["~/howard/databases/parquet/current"])
 2913            + config.get("folders", {})
 2914            .get("databases", {})
 2915            .get("bcftools", ["~/howard/databases/bcftools/current"])
 2916        )
 2917
 2918        # Get param annotations
 2919        if param.get("annotations", None) and isinstance(
 2920            param.get("annotations", None), str
 2921        ):
 2922            log.debug(param.get("annotations", None))
 2923            param_annotation_list = param.get("annotations").split(",")
 2924        else:
 2925            param_annotation_list = []
 2926
 2927        # Each tools param
 2928        if param.get("annotation_parquet", None) != None:
 2929            log.debug(
 2930                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
 2931            )
 2932            if isinstance(param.get("annotation_parquet", None), list):
 2933                param_annotation_list.append(",".join(param.get("annotation_parquet")))
 2934            else:
 2935                param_annotation_list.append(param.get("annotation_parquet"))
 2936        if param.get("annotation_snpsift", None) != None:
 2937            if isinstance(param.get("annotation_snpsift", None), list):
 2938                param_annotation_list.append(
 2939                    "snpsift:"
 2940                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
 2941                )
 2942            else:
 2943                param_annotation_list.append(
 2944                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
 2945                )
 2946        if param.get("annotation_snpeff", None) != None:
 2947            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
 2948        if param.get("annotation_bcftools", None) != None:
 2949            if isinstance(param.get("annotation_bcftools", None), list):
 2950                param_annotation_list.append(
 2951                    "bcftools:"
 2952                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
 2953                )
 2954            else:
 2955                param_annotation_list.append(
 2956                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
 2957                )
 2958        if param.get("annotation_annovar", None) != None:
 2959            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
 2960        if param.get("annotation_exomiser", None) != None:
 2961            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
 2962        if param.get("annotation_splice", None) != None:
 2963            param_annotation_list.append("splice:" + param.get("annotation_splice"))
 2964
 2965        # Merge param annotations list
 2966        param["annotations"] = ",".join(param_annotation_list)
 2967
 2968        # debug
 2969        log.debug(f"param_annotations={param['annotations']}")
 2970
 2971        if param.get("annotations"):
 2972
 2973            # Log
 2974            # log.info("Annotations - Check annotation parameters")
 2975
 2976            if not "annotation" in param:
 2977                param["annotation"] = {}
 2978
 2979            # List of annotations parameters
 2980            annotations_list_input = {}
 2981            if isinstance(param.get("annotations", None), str):
 2982                annotation_file_list = [
 2983                    value for value in param.get("annotations", "").split(",")
 2984                ]
 2985                for annotation_file in annotation_file_list:
 2986                    annotations_list_input[annotation_file.strip()] = {"INFO": None}
 2987            else:
 2988                annotations_list_input = param.get("annotations", {})
 2989
 2990            log.info(f"Quick Annotations:")
 2991            for annotation_key in list(annotations_list_input.keys()):
 2992                log.info(f"   {annotation_key}")
 2993
 2994            # List of annotations and associated fields
 2995            annotations_list = {}
 2996
 2997            for annotation_file in annotations_list_input:
 2998
 2999                # Explode annotations if ALL
 3000                if (
 3001                    annotation_file.upper() == "ALL"
 3002                    or annotation_file.upper().startswith("ALL:")
 3003                ):
 3004
 3005                    # check ALL parameters (formats, releases)
 3006                    annotation_file_split = annotation_file.split(":")
 3007                    database_formats = "parquet"
 3008                    database_releases = "current"
 3009                    for annotation_file_option in annotation_file_split[1:]:
 3010                        database_all_options_split = annotation_file_option.split("=")
 3011                        if database_all_options_split[0] == "format":
 3012                            database_formats = database_all_options_split[1].split("+")
 3013                        if database_all_options_split[0] == "release":
 3014                            database_releases = database_all_options_split[1].split("+")
 3015
 3016                    # Scan for availabled databases
 3017                    databases_infos_dict = self.scan_databases(
 3018                        database_formats=database_formats,
 3019                        database_releases=database_releases,
 3020                    )
 3021
 3022                    # Add found databases in annotation parameters
 3023                    for database_infos in databases_infos_dict.keys():
 3024                        annotations_list[database_infos] = {"INFO": None}
 3025
 3026                else:
 3027                    annotations_list[annotation_file] = annotations_list_input[
 3028                        annotation_file
 3029                    ]
 3030
 3031            # Check each databases
 3032            if len(annotations_list):
 3033
 3034                log.info(
 3035                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
 3036                )
 3037
 3038                for annotation_file in annotations_list:
 3039
 3040                    # Init
 3041                    annotations = annotations_list.get(annotation_file, None)
 3042
 3043                    # Annotation snpEff
 3044                    if annotation_file.startswith("snpeff"):
 3045
 3046                        log.debug(f"Quick Annotation snpEff")
 3047
 3048                        if "snpeff" not in param["annotation"]:
 3049                            param["annotation"]["snpeff"] = {}
 3050
 3051                        if "options" not in param["annotation"]["snpeff"]:
 3052                            param["annotation"]["snpeff"]["options"] = ""
 3053
 3054                        # snpEff options in annotations
 3055                        param["annotation"]["snpeff"]["options"] = "".join(
 3056                            annotation_file.split(":")[1:]
 3057                        )
 3058
 3059                    # Annotation Annovar
 3060                    elif annotation_file.startswith("annovar"):
 3061
 3062                        log.debug(f"Quick Annotation Annovar")
 3063
 3064                        if "annovar" not in param["annotation"]:
 3065                            param["annotation"]["annovar"] = {}
 3066
 3067                        if "annotations" not in param["annotation"]["annovar"]:
 3068                            param["annotation"]["annovar"]["annotations"] = {}
 3069
 3070                        # Options
 3071                        annotation_file_split = annotation_file.split(":")
 3072                        for annotation_file_annotation in annotation_file_split[1:]:
 3073                            if annotation_file_annotation:
 3074                                param["annotation"]["annovar"]["annotations"][
 3075                                    annotation_file_annotation
 3076                                ] = annotations
 3077
 3078                    # Annotation Exomiser
 3079                    elif annotation_file.startswith("exomiser"):
 3080
 3081                        log.debug(f"Quick Annotation Exomiser")
 3082
 3083                        param["annotation"]["exomiser"] = params_string_to_dict(
 3084                            annotation_file
 3085                        )
 3086
 3087                    # Annotation Splice
 3088                    elif annotation_file.startswith("splice"):
 3089
 3090                        log.debug(f"Quick Annotation Splice")
 3091
 3092                        param["annotation"]["splice"] = params_string_to_dict(
 3093                            annotation_file
 3094                        )
 3095
 3096                    # Annotation Parquet or BCFTOOLS
 3097                    else:
 3098
 3099                        # Tools detection
 3100                        if annotation_file.startswith("bcftools:"):
 3101                            annotation_tool_initial = "bcftools"
 3102                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3103                        elif annotation_file.startswith("snpsift:"):
 3104                            annotation_tool_initial = "snpsift"
 3105                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3106                        elif annotation_file.startswith("bigwig:"):
 3107                            annotation_tool_initial = "bigwig"
 3108                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3109                        else:
 3110                            annotation_tool_initial = None
 3111
 3112                        # list of files
 3113                        annotation_file_list = annotation_file.replace("+", ":").split(
 3114                            ":"
 3115                        )
 3116
 3117                        for annotation_file in annotation_file_list:
 3118
 3119                            if annotation_file:
 3120
 3121                                # Annotation tool initial
 3122                                annotation_tool = annotation_tool_initial
 3123
 3124                                # Find file
 3125                                annotation_file_found = None
 3126
 3127                                if os.path.exists(annotation_file):
 3128                                    annotation_file_found = annotation_file
 3129                                elif os.path.exists(full_path(annotation_file)):
 3130                                    annotation_file_found = full_path(annotation_file)
 3131                                else:
 3132                                    # Find within assembly folders
 3133                                    for annotations_database in annotations_databases:
 3134                                        found_files = find_all(
 3135                                            annotation_file,
 3136                                            os.path.join(
 3137                                                annotations_database, assembly
 3138                                            ),
 3139                                        )
 3140                                        if len(found_files) > 0:
 3141                                            annotation_file_found = found_files[0]
 3142                                            break
 3143                                    if not annotation_file_found and not assembly:
 3144                                        # Find within folders
 3145                                        for (
 3146                                            annotations_database
 3147                                        ) in annotations_databases:
 3148                                            found_files = find_all(
 3149                                                annotation_file, annotations_database
 3150                                            )
 3151                                            if len(found_files) > 0:
 3152                                                annotation_file_found = found_files[0]
 3153                                                break
 3154                                log.debug(
 3155                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
 3156                                )
 3157
 3158                                # Full path
 3159                                annotation_file_found = full_path(annotation_file_found)
 3160
 3161                                if annotation_file_found:
 3162
 3163                                    database = Database(database=annotation_file_found)
 3164                                    quick_annotation_format = database.get_format()
 3165                                    quick_annotation_is_compressed = (
 3166                                        database.is_compressed()
 3167                                    )
 3168                                    quick_annotation_is_indexed = os.path.exists(
 3169                                        f"{annotation_file_found}.tbi"
 3170                                    )
 3171                                    bcftools_preference = False
 3172
 3173                                    # Check Annotation Tool
 3174                                    if not annotation_tool:
 3175                                        if (
 3176                                            bcftools_preference
 3177                                            and quick_annotation_format
 3178                                            in ["vcf", "bed"]
 3179                                            and quick_annotation_is_compressed
 3180                                            and quick_annotation_is_indexed
 3181                                        ):
 3182                                            annotation_tool = "bcftools"
 3183                                        elif quick_annotation_format in [
 3184                                            "vcf",
 3185                                            "bed",
 3186                                            "tsv",
 3187                                            "tsv",
 3188                                            "csv",
 3189                                            "json",
 3190                                            "tbl",
 3191                                            "parquet",
 3192                                            "duckdb",
 3193                                        ]:
 3194                                            annotation_tool = "parquet"
 3195                                        elif quick_annotation_format in ["bw"]:
 3196                                            annotation_tool = "bigwig"
 3197                                        else:
 3198                                            log.error(
 3199                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3200                                            )
 3201                                            raise ValueError(
 3202                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3203                                            )
 3204
 3205                                    log.debug(
 3206                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
 3207                                    )
 3208
 3209                                    # Annotation Tool dispatch
 3210                                    if annotation_tool:
 3211                                        if annotation_tool not in param["annotation"]:
 3212                                            param["annotation"][annotation_tool] = {}
 3213                                        if (
 3214                                            "annotations"
 3215                                            not in param["annotation"][annotation_tool]
 3216                                        ):
 3217                                            param["annotation"][annotation_tool][
 3218                                                "annotations"
 3219                                            ] = {}
 3220                                        param["annotation"][annotation_tool][
 3221                                            "annotations"
 3222                                        ][annotation_file_found] = annotations
 3223
 3224                                else:
 3225                                    log.warning(
 3226                                        f"Quick Annotation File {annotation_file} does NOT exist"
 3227                                    )
 3228
 3229                self.set_param(param)
 3230
 3231        if param.get("annotation", None):
 3232            log.info("Annotations")
 3233            if param.get("annotation", {}).get("parquet", None):
 3234                log.info("Annotations 'parquet'...")
 3235                self.annotation_parquet()
 3236            if param.get("annotation", {}).get("bcftools", None):
 3237                log.info("Annotations 'bcftools'...")
 3238                self.annotation_bcftools()
 3239            if param.get("annotation", {}).get("snpsift", None):
 3240                log.info("Annotations 'snpsift'...")
 3241                self.annotation_snpsift()
 3242            if param.get("annotation", {}).get("bigwig", None):
 3243                log.info("Annotations 'bigwig'...")
 3244                self.annotation_bigwig()
 3245            if param.get("annotation", {}).get("annovar", None):
 3246                log.info("Annotations 'annovar'...")
 3247                self.annotation_annovar()
 3248            if param.get("annotation", {}).get("snpeff", None):
 3249                log.info("Annotations 'snpeff'...")
 3250                self.annotation_snpeff()
 3251            if param.get("annotation", {}).get("exomiser", None) is not None:
 3252                log.info("Annotations 'exomiser'...")
 3253                self.annotation_exomiser()
 3254            if param.get("annotation", {}).get("splice", None) is not None:
 3255                log.info("Annotations 'splice' ...")
 3256                self.annotation_splice()
 3257
 3258        # Explode INFOS fields into table fields
 3259        if self.get_explode_infos():
 3260            self.explode_infos(
 3261                prefix=self.get_explode_infos_prefix(),
 3262                fields=self.get_explode_infos_fields(),
 3263                force=True,
 3264            )
 3265
 3266    def annotation_bigwig(self, threads: int = None) -> None:
 3267        """
 3268        The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases.
 3269
 3270        :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the
 3271        number of threads to be used for parallel processing during the annotation process. If the
 3272        `threads` parameter is not provided, the method will attempt to determine the optimal number of
 3273        threads to use based on the system configuration
 3274        :type threads: int
 3275        :return: True
 3276        """
 3277
 3278        # DEBUG
 3279        log.debug("Start annotation with bigwig databases")
 3280
 3281        # # Threads
 3282        # if not threads:
 3283        #     threads = self.get_threads()
 3284        # log.debug("Threads: " + str(threads))
 3285
 3286        # Config
 3287        config = self.get_config()
 3288        log.debug("Config: " + str(config))
 3289
 3290        # Config - BCFTools databases folders
 3291        databases_folders = set(
 3292            self.get_config()
 3293            .get("folders", {})
 3294            .get("databases", {})
 3295            .get("annotations", ["."])
 3296            + self.get_config()
 3297            .get("folders", {})
 3298            .get("databases", {})
 3299            .get("bigwig", ["."])
 3300        )
 3301        log.debug("Databases annotations: " + str(databases_folders))
 3302
 3303        # Param
 3304        annotations = (
 3305            self.get_param()
 3306            .get("annotation", {})
 3307            .get("bigwig", {})
 3308            .get("annotations", None)
 3309        )
 3310        log.debug("Annotations: " + str(annotations))
 3311
 3312        # Assembly
 3313        assembly = self.get_param().get(
 3314            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3315        )
 3316
 3317        # Data
 3318        table_variants = self.get_table_variants()
 3319
 3320        # Check if not empty
 3321        log.debug("Check if not empty")
 3322        sql_query_chromosomes = (
 3323            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3324        )
 3325        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3326        if not sql_query_chromosomes_df["count"][0]:
 3327            log.info(f"VCF empty")
 3328            return
 3329
 3330        # VCF header
 3331        vcf_reader = self.get_header()
 3332        log.debug("Initial header: " + str(vcf_reader.infos))
 3333
 3334        # Existing annotations
 3335        for vcf_annotation in self.get_header().infos:
 3336
 3337            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3338            log.debug(
 3339                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3340            )
 3341
 3342        if annotations:
 3343
 3344            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3345
 3346                # Export VCF file
 3347                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3348
 3349                # annotation_bigwig_config
 3350                annotation_bigwig_config_list = []
 3351
 3352                for annotation in annotations:
 3353                    annotation_fields = annotations[annotation]
 3354
 3355                    # Annotation Name
 3356                    annotation_name = os.path.basename(annotation)
 3357
 3358                    if not annotation_fields:
 3359                        annotation_fields = {"INFO": None}
 3360
 3361                    log.debug(f"Annotation '{annotation_name}'")
 3362                    log.debug(
 3363                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3364                    )
 3365
 3366                    # Create Database
 3367                    database = Database(
 3368                        database=annotation,
 3369                        databases_folders=databases_folders,
 3370                        assembly=assembly,
 3371                    )
 3372
 3373                    # Find files
 3374                    db_file = database.get_database()
 3375                    db_file = full_path(db_file)
 3376                    db_hdr_file = database.get_header_file()
 3377                    db_hdr_file = full_path(db_hdr_file)
 3378                    db_file_type = database.get_format()
 3379
 3380                    # If db_file is http ?
 3381                    if database.get_database().startswith("http"):
 3382
 3383                        # Datbase is HTTP URL
 3384                        db_file_is_http = True
 3385
 3386                        # DB file keep as URL
 3387                        db_file = database.get_database()
 3388                        log.warning(
 3389                            f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)"
 3390                        )
 3391
 3392                        # Retrieve automatic annotation field name
 3393                        annotation_field = clean_annotation_field(
 3394                            os.path.basename(db_file).replace(".bw", "")
 3395                        )
 3396                        log.debug(
 3397                            f"Create header file with annotation field '{annotation_field}' is an HTTP URL"
 3398                        )
 3399
 3400                        # Create automatic header file
 3401                        db_hdr_file = os.path.join(tmp_dir, "header.hdr")
 3402                        with open(db_hdr_file, "w") as f:
 3403                            f.write("##fileformat=VCFv4.2\n")
 3404                            f.write(
 3405                                f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n"""
 3406                            )
 3407                            f.write(f"#CHROM	START	END	{annotation_field}\n")
 3408
 3409                    else:
 3410
 3411                        # Datbase is NOT HTTP URL
 3412                        db_file_is_http = False
 3413
 3414                    # Check index - try to create if not exists
 3415                    if (
 3416                        db_file is None
 3417                        or db_hdr_file is None
 3418                        or (not os.path.exists(db_file) and not db_file_is_http)
 3419                        or not os.path.exists(db_hdr_file)
 3420                        or not db_file_type in ["bw"]
 3421                    ):
 3422                        # if False:
 3423                        log.error("Annotation failed: database not valid")
 3424                        log.error(f"Annotation annotation file: {db_file}")
 3425                        log.error(f"Annotation annotation file type: {db_file_type}")
 3426                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3427                        raise ValueError(
 3428                            f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}"
 3429                        )
 3430                    else:
 3431
 3432                        # Log
 3433                        log.debug(
 3434                            f"Annotation '{annotation}' - file: "
 3435                            + str(db_file)
 3436                            + " and "
 3437                            + str(db_hdr_file)
 3438                        )
 3439
 3440                        # Load header as VCF object
 3441                        db_hdr_vcf = Variants(input=db_hdr_file)
 3442                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3443                        log.debug(
 3444                            "Annotation database header: "
 3445                            + str(db_hdr_vcf_header_infos)
 3446                        )
 3447
 3448                        # For all fields in database
 3449                        annotation_fields_full = False
 3450                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3451                            annotation_fields = {
 3452                                key: key for key in db_hdr_vcf_header_infos
 3453                            }
 3454                            log.debug(
 3455                                "Annotation database header - All annotations added: "
 3456                                + str(annotation_fields)
 3457                            )
 3458                            annotation_fields_full = True
 3459
 3460                        # Init
 3461                        cyvcf2_header_rename_dict = {}
 3462                        cyvcf2_header_list = []
 3463                        cyvcf2_header_indexes = {}
 3464
 3465                        # process annotation fields
 3466                        for annotation_field in annotation_fields:
 3467
 3468                            # New annotation name
 3469                            annotation_field_new = annotation_fields[annotation_field]
 3470
 3471                            # Check annotation field and index in header
 3472                            if (
 3473                                annotation_field
 3474                                in db_hdr_vcf.get_header_columns_as_list()
 3475                            ):
 3476                                annotation_field_index = (
 3477                                    db_hdr_vcf.get_header_columns_as_list().index(
 3478                                        annotation_field
 3479                                    )
 3480                                    - 3
 3481                                )
 3482                                cyvcf2_header_indexes[annotation_field_new] = (
 3483                                    annotation_field_index
 3484                                )
 3485                            else:
 3486                                msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'"
 3487                                log.error(msg_err)
 3488                                raise ValueError(msg_err)
 3489
 3490                            # Append annotation field in cyvcf2 header list
 3491                            cyvcf2_header_rename_dict[annotation_field_new] = (
 3492                                db_hdr_vcf_header_infos[annotation_field].id
 3493                            )
 3494                            cyvcf2_header_list.append(
 3495                                {
 3496                                    "ID": annotation_field_new,
 3497                                    "Number": db_hdr_vcf_header_infos[
 3498                                        annotation_field
 3499                                    ].num,
 3500                                    "Type": db_hdr_vcf_header_infos[
 3501                                        annotation_field
 3502                                    ].type,
 3503                                    "Description": db_hdr_vcf_header_infos[
 3504                                        annotation_field
 3505                                    ].desc,
 3506                                }
 3507                            )
 3508
 3509                            # Add header on VCF
 3510                            vcf_reader.infos[annotation_field_new] = vcf.parser._Info(
 3511                                annotation_field_new,
 3512                                db_hdr_vcf_header_infos[annotation_field].num,
 3513                                db_hdr_vcf_header_infos[annotation_field].type,
 3514                                db_hdr_vcf_header_infos[annotation_field].desc,
 3515                                "HOWARD BigWig annotation",
 3516                                "unknown",
 3517                                self.code_type_map[
 3518                                    db_hdr_vcf_header_infos[annotation_field].type
 3519                                ],
 3520                            )
 3521
 3522                        # Load bigwig database
 3523                        bw_db = pyBigWig.open(db_file)
 3524                        if bw_db.isBigWig():
 3525                            log.debug(f"Database '{db_file}' is in 'BigWig' format")
 3526                        else:
 3527                            msg_err = f"Database '{db_file}' is NOT in 'BigWig' format"
 3528                            log.error(msg_err)
 3529                            raise ValueError(msg_err)
 3530
 3531                        annotation_bigwig_config_list.append(
 3532                            {
 3533                                "db_file": db_file,
 3534                                "bw_db": bw_db,
 3535                                "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict,
 3536                                "cyvcf2_header_list": cyvcf2_header_list,
 3537                                "cyvcf2_header_indexes": cyvcf2_header_indexes,
 3538                            }
 3539                        )
 3540
 3541                # Annotate
 3542                if annotation_bigwig_config_list:
 3543
 3544                    # Annotation config
 3545                    log.debug(
 3546                        f"annotation_bigwig_config={annotation_bigwig_config_list}"
 3547                    )
 3548
 3549                    # Export VCF file
 3550                    self.export_variant_vcf(
 3551                        vcf_file=tmp_vcf_name,
 3552                        remove_info=True,
 3553                        add_samples=False,
 3554                        index=True,
 3555                    )
 3556
 3557                    # Load input tmp file
 3558                    input_vcf = cyvcf2.VCF(tmp_vcf_name)
 3559
 3560                    # Add header in input file
 3561                    for annotation_bigwig_config in annotation_bigwig_config_list:
 3562                        for cyvcf2_header_field in annotation_bigwig_config.get(
 3563                            "cyvcf2_header_list", []
 3564                        ):
 3565                            log.info(
 3566                                f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'"
 3567                            )
 3568                            input_vcf.add_info_to_header(cyvcf2_header_field)
 3569
 3570                    # Create output VCF file
 3571                    output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz")
 3572                    output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf)
 3573
 3574                    # Fetch variants
 3575                    log.info(f"Annotations 'bigwig' start...")
 3576                    for variant in input_vcf:
 3577
 3578                        for annotation_bigwig_config in annotation_bigwig_config_list:
 3579
 3580                            # DB and indexes
 3581                            bw_db = annotation_bigwig_config.get("bw_db", None)
 3582                            cyvcf2_header_indexes = annotation_bigwig_config.get(
 3583                                "cyvcf2_header_indexes", None
 3584                            )
 3585
 3586                            # Retrieve value from chrom pos
 3587                            res = bw_db.values(
 3588                                variant.CHROM, variant.POS - 1, variant.POS
 3589                            )
 3590
 3591                            # For each annotation fields (and indexes)
 3592                            for cyvcf2_header_index in cyvcf2_header_indexes:
 3593
 3594                                # If value is NOT nNone
 3595                                if not np.isnan(
 3596                                    res[cyvcf2_header_indexes[cyvcf2_header_index]]
 3597                                ):
 3598                                    variant.INFO[cyvcf2_header_index] = res[
 3599                                        cyvcf2_header_indexes[cyvcf2_header_index]
 3600                                    ]
 3601
 3602                        # Add record in output file
 3603                        output_vcf.write_record(variant)
 3604
 3605                    # Log
 3606                    log.debug(f"Annotation done.")
 3607
 3608                    # Close and write file
 3609                    log.info(f"Annotations 'bigwig' write...")
 3610                    output_vcf.close()
 3611                    log.debug(f"Write done.")
 3612
 3613                    # Update variants
 3614                    log.info(f"Annotations 'bigwig' update...")
 3615                    self.update_from_vcf(output_vcf_file)
 3616                    log.debug(f"Update done.")
 3617
 3618        return True
 3619
 3620    def annotation_snpsift(self, threads: int = None) -> None:
 3621        """
 3622        This function annotate with bcftools
 3623
 3624        :param threads: Number of threads to use
 3625        :return: the value of the variable "return_value".
 3626        """
 3627
 3628        # DEBUG
 3629        log.debug("Start annotation with bcftools databases")
 3630
 3631        # Threads
 3632        if not threads:
 3633            threads = self.get_threads()
 3634        log.debug("Threads: " + str(threads))
 3635
 3636        # Config
 3637        config = self.get_config()
 3638        log.debug("Config: " + str(config))
 3639
 3640        # Config - snpSift
 3641        snpsift_bin_command = get_bin_command(
 3642            bin="SnpSift.jar",
 3643            tool="snpsift",
 3644            bin_type="jar",
 3645            config=config,
 3646            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 3647        )
 3648        if not snpsift_bin_command:
 3649            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
 3650            log.error(msg_err)
 3651            raise ValueError(msg_err)
 3652
 3653        # Config - bcftools
 3654        bcftools_bin_command = get_bin_command(
 3655            bin="bcftools",
 3656            tool="bcftools",
 3657            bin_type="bin",
 3658            config=config,
 3659            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3660        )
 3661        if not bcftools_bin_command:
 3662            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3663            log.error(msg_err)
 3664            raise ValueError(msg_err)
 3665
 3666        # Config - BCFTools databases folders
 3667        databases_folders = set(
 3668            self.get_config()
 3669            .get("folders", {})
 3670            .get("databases", {})
 3671            .get("annotations", ["."])
 3672            + self.get_config()
 3673            .get("folders", {})
 3674            .get("databases", {})
 3675            .get("bcftools", ["."])
 3676        )
 3677        log.debug("Databases annotations: " + str(databases_folders))
 3678
 3679        # Param
 3680        annotations = (
 3681            self.get_param()
 3682            .get("annotation", {})
 3683            .get("snpsift", {})
 3684            .get("annotations", None)
 3685        )
 3686        log.debug("Annotations: " + str(annotations))
 3687
 3688        # Assembly
 3689        assembly = self.get_param().get(
 3690            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3691        )
 3692
 3693        # Data
 3694        table_variants = self.get_table_variants()
 3695
 3696        # Check if not empty
 3697        log.debug("Check if not empty")
 3698        sql_query_chromosomes = (
 3699            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3700        )
 3701        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3702        if not sql_query_chromosomes_df["count"][0]:
 3703            log.info(f"VCF empty")
 3704            return
 3705
 3706        # VCF header
 3707        vcf_reader = self.get_header()
 3708        log.debug("Initial header: " + str(vcf_reader.infos))
 3709
 3710        # Existing annotations
 3711        for vcf_annotation in self.get_header().infos:
 3712
 3713            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3714            log.debug(
 3715                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3716            )
 3717
 3718        if annotations:
 3719
 3720            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3721
 3722                # Export VCF file
 3723                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3724
 3725                # Init
 3726                commands = {}
 3727
 3728                for annotation in annotations:
 3729                    annotation_fields = annotations[annotation]
 3730
 3731                    # Annotation Name
 3732                    annotation_name = os.path.basename(annotation)
 3733
 3734                    if not annotation_fields:
 3735                        annotation_fields = {"INFO": None}
 3736
 3737                    log.debug(f"Annotation '{annotation_name}'")
 3738                    log.debug(
 3739                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3740                    )
 3741
 3742                    # Create Database
 3743                    database = Database(
 3744                        database=annotation,
 3745                        databases_folders=databases_folders,
 3746                        assembly=assembly,
 3747                    )
 3748
 3749                    # Find files
 3750                    db_file = database.get_database()
 3751                    db_file = full_path(db_file)
 3752                    db_hdr_file = database.get_header_file()
 3753                    db_hdr_file = full_path(db_hdr_file)
 3754                    db_file_type = database.get_format()
 3755                    db_tbi_file = f"{db_file}.tbi"
 3756                    db_file_compressed = database.is_compressed()
 3757
 3758                    # Check if compressed
 3759                    if not db_file_compressed:
 3760                        log.error(
 3761                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3762                        )
 3763                        raise ValueError(
 3764                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3765                        )
 3766
 3767                    # Check if indexed
 3768                    if not os.path.exists(db_tbi_file):
 3769                        log.error(
 3770                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3771                        )
 3772                        raise ValueError(
 3773                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3774                        )
 3775
 3776                    # Check index - try to create if not exists
 3777                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 3778                        log.error("Annotation failed: database not valid")
 3779                        log.error(f"Annotation annotation file: {db_file}")
 3780                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3781                        log.error(f"Annotation annotation index: {db_tbi_file}")
 3782                        raise ValueError(
 3783                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 3784                        )
 3785                    else:
 3786
 3787                        log.debug(
 3788                            f"Annotation '{annotation}' - file: "
 3789                            + str(db_file)
 3790                            + " and "
 3791                            + str(db_hdr_file)
 3792                        )
 3793
 3794                        # Load header as VCF object
 3795                        db_hdr_vcf = Variants(input=db_hdr_file)
 3796                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3797                        log.debug(
 3798                            "Annotation database header: "
 3799                            + str(db_hdr_vcf_header_infos)
 3800                        )
 3801
 3802                        # For all fields in database
 3803                        annotation_fields_full = False
 3804                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3805                            annotation_fields = {
 3806                                key: key for key in db_hdr_vcf_header_infos
 3807                            }
 3808                            log.debug(
 3809                                "Annotation database header - All annotations added: "
 3810                                + str(annotation_fields)
 3811                            )
 3812                            annotation_fields_full = True
 3813
 3814                        # # Create file for field rename
 3815                        # log.debug("Create file for field rename")
 3816                        # tmp_rename = NamedTemporaryFile(
 3817                        #     prefix=self.get_prefix(),
 3818                        #     dir=self.get_tmp_dir(),
 3819                        #     suffix=".rename",
 3820                        #     delete=False,
 3821                        # )
 3822                        # tmp_rename_name = tmp_rename.name
 3823                        # tmp_files.append(tmp_rename_name)
 3824
 3825                        # Number of fields
 3826                        nb_annotation_field = 0
 3827                        annotation_list = []
 3828                        annotation_infos_rename_list = []
 3829
 3830                        for annotation_field in annotation_fields:
 3831
 3832                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 3833                            annotation_fields_new_name = annotation_fields.get(
 3834                                annotation_field, annotation_field
 3835                            )
 3836                            if not annotation_fields_new_name:
 3837                                annotation_fields_new_name = annotation_field
 3838
 3839                            # Check if field is in DB and if field is not elready in input data
 3840                            if (
 3841                                annotation_field in db_hdr_vcf.get_header().infos
 3842                                and annotation_fields_new_name
 3843                                not in self.get_header().infos
 3844                            ):
 3845
 3846                                log.info(
 3847                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 3848                                )
 3849
 3850                                # BCFTools annotate param to rename fields
 3851                                if annotation_field != annotation_fields_new_name:
 3852                                    annotation_infos_rename_list.append(
 3853                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 3854                                    )
 3855
 3856                                # Add INFO field to header
 3857                                db_hdr_vcf_header_infos_number = (
 3858                                    db_hdr_vcf_header_infos[annotation_field].num or "."
 3859                                )
 3860                                db_hdr_vcf_header_infos_type = (
 3861                                    db_hdr_vcf_header_infos[annotation_field].type
 3862                                    or "String"
 3863                                )
 3864                                db_hdr_vcf_header_infos_description = (
 3865                                    db_hdr_vcf_header_infos[annotation_field].desc
 3866                                    or f"{annotation_field} description"
 3867                                )
 3868                                db_hdr_vcf_header_infos_source = (
 3869                                    db_hdr_vcf_header_infos[annotation_field].source
 3870                                    or "unknown"
 3871                                )
 3872                                db_hdr_vcf_header_infos_version = (
 3873                                    db_hdr_vcf_header_infos[annotation_field].version
 3874                                    or "unknown"
 3875                                )
 3876
 3877                                vcf_reader.infos[annotation_fields_new_name] = (
 3878                                    vcf.parser._Info(
 3879                                        annotation_fields_new_name,
 3880                                        db_hdr_vcf_header_infos_number,
 3881                                        db_hdr_vcf_header_infos_type,
 3882                                        db_hdr_vcf_header_infos_description,
 3883                                        db_hdr_vcf_header_infos_source,
 3884                                        db_hdr_vcf_header_infos_version,
 3885                                        self.code_type_map[
 3886                                            db_hdr_vcf_header_infos_type
 3887                                        ],
 3888                                    )
 3889                                )
 3890
 3891                                annotation_list.append(annotation_field)
 3892
 3893                                nb_annotation_field += 1
 3894
 3895                            else:
 3896
 3897                                if (
 3898                                    annotation_field
 3899                                    not in db_hdr_vcf.get_header().infos
 3900                                ):
 3901                                    log.warning(
 3902                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
 3903                                    )
 3904                                if (
 3905                                    annotation_fields_new_name
 3906                                    in self.get_header().infos
 3907                                ):
 3908                                    log.warning(
 3909                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
 3910                                    )
 3911
 3912                        log.info(
 3913                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 3914                        )
 3915
 3916                        annotation_infos = ",".join(annotation_list)
 3917
 3918                        if annotation_infos != "":
 3919
 3920                            # Annotated VCF (and error file)
 3921                            tmp_annotation_vcf_name = os.path.join(
 3922                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
 3923                            )
 3924                            tmp_annotation_vcf_name_err = (
 3925                                tmp_annotation_vcf_name + ".err"
 3926                            )
 3927
 3928                            # Add fields to annotate
 3929                            if not annotation_fields_full:
 3930                                annotation_infos_option = f"-info {annotation_infos}"
 3931                            else:
 3932                                annotation_infos_option = ""
 3933
 3934                            # Info fields rename
 3935                            if annotation_infos_rename_list:
 3936                                annotation_infos_rename = " -c " + ",".join(
 3937                                    annotation_infos_rename_list
 3938                                )
 3939                            else:
 3940                                annotation_infos_rename = ""
 3941
 3942                            # Annotate command
 3943                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 3944
 3945                            # Add command
 3946                            commands[command_annotate] = tmp_annotation_vcf_name
 3947
 3948                if commands:
 3949
 3950                    # Export VCF file
 3951                    self.export_variant_vcf(
 3952                        vcf_file=tmp_vcf_name,
 3953                        remove_info=True,
 3954                        add_samples=False,
 3955                        index=True,
 3956                    )
 3957                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
 3958
 3959                    # Num command
 3960                    nb_command = 0
 3961
 3962                    # Annotate
 3963                    for command_annotate in commands:
 3964                        nb_command += 1
 3965                        log.info(
 3966                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
 3967                        )
 3968                        log.debug(f"command_annotate={command_annotate}")
 3969                        run_parallel_commands([command_annotate], threads)
 3970
 3971                        # Debug
 3972                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
 3973
 3974                        # Update variants
 3975                        log.info(
 3976                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
 3977                        )
 3978                        self.update_from_vcf(commands[command_annotate])
 3979
 3980    def annotation_bcftools(self, threads: int = None) -> None:
 3981        """
 3982        This function annotate with bcftools
 3983
 3984        :param threads: Number of threads to use
 3985        :return: the value of the variable "return_value".
 3986        """
 3987
 3988        # DEBUG
 3989        log.debug("Start annotation with bcftools databases")
 3990
 3991        # Threads
 3992        if not threads:
 3993            threads = self.get_threads()
 3994        log.debug("Threads: " + str(threads))
 3995
 3996        # Config
 3997        config = self.get_config()
 3998        log.debug("Config: " + str(config))
 3999
 4000        # DEBUG
 4001        delete_tmp = True
 4002        if self.get_config().get("verbosity", "warning") in ["debug"]:
 4003            delete_tmp = False
 4004            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 4005
 4006        # Config - BCFTools bin command
 4007        bcftools_bin_command = get_bin_command(
 4008            bin="bcftools",
 4009            tool="bcftools",
 4010            bin_type="bin",
 4011            config=config,
 4012            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 4013        )
 4014        if not bcftools_bin_command:
 4015            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 4016            log.error(msg_err)
 4017            raise ValueError(msg_err)
 4018
 4019        # Config - BCFTools databases folders
 4020        databases_folders = set(
 4021            self.get_config()
 4022            .get("folders", {})
 4023            .get("databases", {})
 4024            .get("annotations", ["."])
 4025            + self.get_config()
 4026            .get("folders", {})
 4027            .get("databases", {})
 4028            .get("bcftools", ["."])
 4029        )
 4030        log.debug("Databases annotations: " + str(databases_folders))
 4031
 4032        # Param
 4033        annotations = (
 4034            self.get_param()
 4035            .get("annotation", {})
 4036            .get("bcftools", {})
 4037            .get("annotations", None)
 4038        )
 4039        log.debug("Annotations: " + str(annotations))
 4040
 4041        # Assembly
 4042        assembly = self.get_param().get(
 4043            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 4044        )
 4045
 4046        # Data
 4047        table_variants = self.get_table_variants()
 4048
 4049        # Check if not empty
 4050        log.debug("Check if not empty")
 4051        sql_query_chromosomes = (
 4052            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4053        )
 4054        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 4055        if not sql_query_chromosomes_df["count"][0]:
 4056            log.info(f"VCF empty")
 4057            return
 4058
 4059        # Export in VCF
 4060        log.debug("Create initial file to annotate")
 4061        tmp_vcf = NamedTemporaryFile(
 4062            prefix=self.get_prefix(),
 4063            dir=self.get_tmp_dir(),
 4064            suffix=".vcf.gz",
 4065            delete=False,
 4066        )
 4067        tmp_vcf_name = tmp_vcf.name
 4068
 4069        # VCF header
 4070        vcf_reader = self.get_header()
 4071        log.debug("Initial header: " + str(vcf_reader.infos))
 4072
 4073        # Existing annotations
 4074        for vcf_annotation in self.get_header().infos:
 4075
 4076            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 4077            log.debug(
 4078                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 4079            )
 4080
 4081        if annotations:
 4082
 4083            tmp_ann_vcf_list = []
 4084            commands = []
 4085            tmp_files = []
 4086            err_files = []
 4087
 4088            for annotation in annotations:
 4089                annotation_fields = annotations[annotation]
 4090
 4091                # Annotation Name
 4092                annotation_name = os.path.basename(annotation)
 4093
 4094                if not annotation_fields:
 4095                    annotation_fields = {"INFO": None}
 4096
 4097                log.debug(f"Annotation '{annotation_name}'")
 4098                log.debug(
 4099                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 4100                )
 4101
 4102                # Create Database
 4103                database = Database(
 4104                    database=annotation,
 4105                    databases_folders=databases_folders,
 4106                    assembly=assembly,
 4107                )
 4108
 4109                # Find files
 4110                db_file = database.get_database()
 4111                db_file = full_path(db_file)
 4112                db_hdr_file = database.get_header_file()
 4113                db_hdr_file = full_path(db_hdr_file)
 4114                db_file_type = database.get_format()
 4115                db_tbi_file = f"{db_file}.tbi"
 4116                db_file_compressed = database.is_compressed()
 4117
 4118                # Check if compressed
 4119                if not db_file_compressed:
 4120                    log.error(
 4121                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 4122                    )
 4123                    raise ValueError(
 4124                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 4125                    )
 4126
 4127                # Check if indexed
 4128                if not os.path.exists(db_tbi_file):
 4129                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
 4130                    raise ValueError(
 4131                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
 4132                    )
 4133
 4134                # Check index - try to create if not exists
 4135                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 4136                    log.error("Annotation failed: database not valid")
 4137                    log.error(f"Annotation annotation file: {db_file}")
 4138                    log.error(f"Annotation annotation header: {db_hdr_file}")
 4139                    log.error(f"Annotation annotation index: {db_tbi_file}")
 4140                    raise ValueError(
 4141                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 4142                    )
 4143                else:
 4144
 4145                    log.debug(
 4146                        f"Annotation '{annotation}' - file: "
 4147                        + str(db_file)
 4148                        + " and "
 4149                        + str(db_hdr_file)
 4150                    )
 4151
 4152                    # Load header as VCF object
 4153                    db_hdr_vcf = Variants(input=db_hdr_file)
 4154                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 4155                    log.debug(
 4156                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
 4157                    )
 4158
 4159                    # For all fields in database
 4160                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 4161                        annotation_fields = {
 4162                            key: key for key in db_hdr_vcf_header_infos
 4163                        }
 4164                        log.debug(
 4165                            "Annotation database header - All annotations added: "
 4166                            + str(annotation_fields)
 4167                        )
 4168
 4169                    # Number of fields
 4170                    nb_annotation_field = 0
 4171                    annotation_list = []
 4172
 4173                    for annotation_field in annotation_fields:
 4174
 4175                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 4176                        annotation_fields_new_name = annotation_fields.get(
 4177                            annotation_field, annotation_field
 4178                        )
 4179                        if not annotation_fields_new_name:
 4180                            annotation_fields_new_name = annotation_field
 4181
 4182                        # Check if field is in DB and if field is not elready in input data
 4183                        if (
 4184                            annotation_field in db_hdr_vcf.get_header().infos
 4185                            and annotation_fields_new_name
 4186                            not in self.get_header().infos
 4187                        ):
 4188
 4189                            log.info(
 4190                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 4191                            )
 4192
 4193                            # Add INFO field to header
 4194                            db_hdr_vcf_header_infos_number = (
 4195                                db_hdr_vcf_header_infos[annotation_field].num or "."
 4196                            )
 4197                            db_hdr_vcf_header_infos_type = (
 4198                                db_hdr_vcf_header_infos[annotation_field].type
 4199                                or "String"
 4200                            )
 4201                            db_hdr_vcf_header_infos_description = (
 4202                                db_hdr_vcf_header_infos[annotation_field].desc
 4203                                or f"{annotation_field} description"
 4204                            )
 4205                            db_hdr_vcf_header_infos_source = (
 4206                                db_hdr_vcf_header_infos[annotation_field].source
 4207                                or "unknown"
 4208                            )
 4209                            db_hdr_vcf_header_infos_version = (
 4210                                db_hdr_vcf_header_infos[annotation_field].version
 4211                                or "unknown"
 4212                            )
 4213
 4214                            vcf_reader.infos[annotation_fields_new_name] = (
 4215                                vcf.parser._Info(
 4216                                    annotation_fields_new_name,
 4217                                    db_hdr_vcf_header_infos_number,
 4218                                    db_hdr_vcf_header_infos_type,
 4219                                    db_hdr_vcf_header_infos_description,
 4220                                    db_hdr_vcf_header_infos_source,
 4221                                    db_hdr_vcf_header_infos_version,
 4222                                    self.code_type_map[db_hdr_vcf_header_infos_type],
 4223                                )
 4224                            )
 4225
 4226                            # annotation_list.append(annotation_field)
 4227                            if annotation_field != annotation_fields_new_name:
 4228                                annotation_list.append(
 4229                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 4230                                )
 4231                            else:
 4232                                annotation_list.append(annotation_field)
 4233
 4234                            nb_annotation_field += 1
 4235
 4236                        else:
 4237
 4238                            if annotation_field not in db_hdr_vcf.get_header().infos:
 4239                                log.warning(
 4240                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
 4241                                )
 4242                            if annotation_fields_new_name in self.get_header().infos:
 4243                                log.warning(
 4244                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 4245                                )
 4246
 4247                    log.info(
 4248                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 4249                    )
 4250
 4251                    annotation_infos = ",".join(annotation_list)
 4252
 4253                    if annotation_infos != "":
 4254
 4255                        # Protect header for bcftools (remove "#CHROM" and variants line)
 4256                        log.debug("Protect Header file - remove #CHROM line if exists")
 4257                        tmp_header_vcf = NamedTemporaryFile(
 4258                            prefix=self.get_prefix(),
 4259                            dir=self.get_tmp_dir(),
 4260                            suffix=".hdr",
 4261                            delete=False,
 4262                        )
 4263                        tmp_header_vcf_name = tmp_header_vcf.name
 4264                        tmp_files.append(tmp_header_vcf_name)
 4265                        # Command
 4266                        if db_hdr_file.endswith(".gz"):
 4267                            command_extract_header = f"zcat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 4268                        else:
 4269                            command_extract_header = f"cat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 4270                        # Run
 4271                        run_parallel_commands([command_extract_header], 1)
 4272
 4273                        # Find chomosomes
 4274                        log.debug("Find chromosomes ")
 4275                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
 4276                        sql_query_chromosomes_df = self.get_query_to_df(
 4277                            sql_query_chromosomes
 4278                        )
 4279                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
 4280
 4281                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
 4282
 4283                        # BED columns in the annotation file
 4284                        if db_file_type in ["bed"]:
 4285                            annotation_infos = "CHROM,POS,POS," + annotation_infos
 4286
 4287                        for chrom in chomosomes_list:
 4288
 4289                            # Create BED on initial VCF
 4290                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
 4291                            tmp_bed = NamedTemporaryFile(
 4292                                prefix=self.get_prefix(),
 4293                                dir=self.get_tmp_dir(),
 4294                                suffix=".bed",
 4295                                delete=False,
 4296                            )
 4297                            tmp_bed_name = tmp_bed.name
 4298                            tmp_files.append(tmp_bed_name)
 4299
 4300                            # Detecte regions
 4301                            log.debug(
 4302                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
 4303                            )
 4304                            window = 1000000
 4305                            sql_query_intervals_for_bed = f"""
 4306                                SELECT  \"#CHROM\",
 4307                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
 4308                                        \"POS\"+{window}
 4309                                FROM {table_variants} as table_variants
 4310                                WHERE table_variants.\"#CHROM\" = '{chrom}'
 4311                            """
 4312                            regions = self.conn.execute(
 4313                                sql_query_intervals_for_bed
 4314                            ).fetchall()
 4315                            merged_regions = merge_regions(regions)
 4316                            log.debug(
 4317                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
 4318                            )
 4319
 4320                            header = ["#CHROM", "START", "END"]
 4321                            with open(tmp_bed_name, "w") as f:
 4322                                # Write the header with tab delimiter
 4323                                f.write("\t".join(header) + "\n")
 4324                                for d in merged_regions:
 4325                                    # Write each data row with tab delimiter
 4326                                    f.write("\t".join(map(str, d)) + "\n")
 4327
 4328                            # Tmp files
 4329                            tmp_annotation_vcf = NamedTemporaryFile(
 4330                                prefix=self.get_prefix(),
 4331                                dir=self.get_tmp_dir(),
 4332                                suffix=".vcf.gz",
 4333                                delete=False,
 4334                            )
 4335                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
 4336                            tmp_files.append(tmp_annotation_vcf_name)
 4337                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
 4338                            tmp_annotation_vcf_name_err = (
 4339                                tmp_annotation_vcf_name + ".err"
 4340                            )
 4341                            err_files.append(tmp_annotation_vcf_name_err)
 4342
 4343                            # Annotate Command
 4344                            log.debug(
 4345                                f"Annotation '{annotation}' - add bcftools command"
 4346                            )
 4347
 4348                            # Command
 4349                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 4350
 4351                            # Add command
 4352                            commands.append(command_annotate)
 4353
 4354            # if some commands
 4355            if commands:
 4356
 4357                # Export VCF file
 4358                self.export_variant_vcf(
 4359                    vcf_file=tmp_vcf_name,
 4360                    remove_info=True,
 4361                    add_samples=False,
 4362                    index=True,
 4363                )
 4364
 4365                # Threads
 4366                # calculate threads for annotated commands
 4367                if commands:
 4368                    threads_bcftools_annotate = round(threads / len(commands))
 4369                else:
 4370                    threads_bcftools_annotate = 1
 4371
 4372                if not threads_bcftools_annotate:
 4373                    threads_bcftools_annotate = 1
 4374
 4375                # Add threads option to bcftools commands
 4376                if threads_bcftools_annotate > 1:
 4377                    commands_threaded = []
 4378                    for command in commands:
 4379                        commands_threaded.append(
 4380                            command.replace(
 4381                                f"{bcftools_bin_command} annotate ",
 4382                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
 4383                            )
 4384                        )
 4385                    commands = commands_threaded
 4386
 4387                # Command annotation multithreading
 4388                log.debug(f"Annotation - Annotation commands: " + str(commands))
 4389                log.info(
 4390                    f"Annotation - Annotation multithreaded in "
 4391                    + str(len(commands))
 4392                    + " commands"
 4393                )
 4394
 4395                run_parallel_commands(commands, threads)
 4396
 4397                # Merge
 4398                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
 4399
 4400                if tmp_ann_vcf_list_cmd:
 4401
 4402                    # Tmp file
 4403                    tmp_annotate_vcf = NamedTemporaryFile(
 4404                        prefix=self.get_prefix(),
 4405                        dir=self.get_tmp_dir(),
 4406                        suffix=".vcf.gz",
 4407                        delete=True,
 4408                    )
 4409                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
 4410                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 4411                    err_files.append(tmp_annotate_vcf_name_err)
 4412
 4413                    # Tmp file remove command
 4414                    tmp_files_remove_command = ""
 4415                    if tmp_files:
 4416                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
 4417
 4418                    # Command merge
 4419                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
 4420                    log.info(
 4421                        f"Annotation - Annotation merging "
 4422                        + str(len(commands))
 4423                        + " annotated files"
 4424                    )
 4425                    log.debug(f"Annotation - merge command: {merge_command}")
 4426                    run_parallel_commands([merge_command], 1)
 4427
 4428                    # Error messages
 4429                    log.info(f"Error/Warning messages:")
 4430                    error_message_command_all = []
 4431                    error_message_command_warning = []
 4432                    error_message_command_err = []
 4433                    for err_file in err_files:
 4434                        with open(err_file, "r") as f:
 4435                            for line in f:
 4436                                message = line.strip()
 4437                                error_message_command_all.append(message)
 4438                                if line.startswith("[W::"):
 4439                                    error_message_command_warning.append(message)
 4440                                if line.startswith("[E::"):
 4441                                    error_message_command_err.append(
 4442                                        f"{err_file}: " + message
 4443                                    )
 4444                    # log info
 4445                    for message in list(
 4446                        set(error_message_command_err + error_message_command_warning)
 4447                    ):
 4448                        log.info(f"   {message}")
 4449                    # debug info
 4450                    for message in list(set(error_message_command_all)):
 4451                        log.debug(f"   {message}")
 4452                    # failed
 4453                    if len(error_message_command_err):
 4454                        log.error("Annotation failed: Error in commands")
 4455                        raise ValueError("Annotation failed: Error in commands")
 4456
 4457                    # Update variants
 4458                    log.info(f"Annotation - Updating...")
 4459                    self.update_from_vcf(tmp_annotate_vcf_name)
 4460
 4461    def annotation_exomiser(self, threads: int = None) -> None:
 4462        """
 4463        This function annotate with Exomiser
 4464
 4465        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
 4466        - "analysis" (dict/file):
 4467            Full analysis dictionnary parameters (see Exomiser docs).
 4468            Either a dict, or a file in JSON or YAML format.
 4469            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
 4470            Default : None
 4471        - "preset" (string):
 4472            Analysis preset (available in config folder).
 4473            Used if no full "analysis" is provided.
 4474            Default: "exome"
 4475        - "phenopacket" (dict/file):
 4476            Samples and phenotipic features parameters (see Exomiser docs).
 4477            Either a dict, or a file in JSON or YAML format.
 4478            Default: None
 4479        - "subject" (dict):
 4480            Sample parameters (see Exomiser docs).
 4481            Example:
 4482                "subject":
 4483                    {
 4484                        "id": "ISDBM322017",
 4485                        "sex": "FEMALE"
 4486                    }
 4487            Default: None
 4488        - "sample" (string):
 4489            Sample name to construct "subject" section:
 4490                "subject":
 4491                    {
 4492                        "id": "<sample>",
 4493                        "sex": "UNKNOWN_SEX"
 4494                    }
 4495            Default: None
 4496        - "phenotypicFeatures" (dict)
 4497            Phenotypic features to construct "subject" section.
 4498            Example:
 4499                "phenotypicFeatures":
 4500                    [
 4501                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
 4502                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
 4503                    ]
 4504        - "hpo" (list)
 4505            List of HPO ids as phenotypic features.
 4506            Example:
 4507                "hpo": ['0001156', '0001363', '0011304', '0010055']
 4508            Default: []
 4509        - "outputOptions" (dict):
 4510            Output options (see Exomiser docs).
 4511            Default:
 4512                "output_options" =
 4513                    {
 4514                        "outputContributingVariantsOnly": False,
 4515                        "numGenes": 0,
 4516                        "outputFormats": ["TSV_VARIANT", "VCF"]
 4517                    }
 4518        - "transcript_source" (string):
 4519            Transcript source (either "refseq", "ucsc", "ensembl")
 4520            Default: "refseq"
 4521        - "exomiser_to_info" (boolean):
 4522            Add exomiser TSV file columns as INFO fields in VCF.
 4523            Default: False
 4524        - "release" (string):
 4525            Exomise database release.
 4526            If not exists, database release will be downloaded (take a while).
 4527            Default: None (provided by application.properties configuration file)
 4528        - "exomiser_application_properties" (file):
 4529            Exomiser configuration file (see Exomiser docs).
 4530            Useful to automatically download databases (especially for specific genome databases).
 4531
 4532        Notes:
 4533        - If no sample in parameters, first sample in VCF will be chosen
 4534        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
 4535
 4536        :param threads: The number of threads to use
 4537        :return: None.
 4538        """
 4539
 4540        # DEBUG
 4541        log.debug("Start annotation with Exomiser databases")
 4542
 4543        # Threads
 4544        if not threads:
 4545            threads = self.get_threads()
 4546        log.debug("Threads: " + str(threads))
 4547
 4548        # Config
 4549        config = self.get_config()
 4550        log.debug("Config: " + str(config))
 4551
 4552        # Config - Folders - Databases
 4553        databases_folders = (
 4554            config.get("folders", {})
 4555            .get("databases", {})
 4556            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
 4557        )
 4558        databases_folders = full_path(databases_folders)
 4559        if not os.path.exists(databases_folders):
 4560            log.error(f"Databases annotations: {databases_folders} NOT found")
 4561        log.debug("Databases annotations: " + str(databases_folders))
 4562
 4563        # Config - Exomiser
 4564        exomiser_bin_command = get_bin_command(
 4565            bin="exomiser-cli*.jar",
 4566            tool="exomiser",
 4567            bin_type="jar",
 4568            config=config,
 4569            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
 4570        )
 4571        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
 4572        if not exomiser_bin_command:
 4573            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
 4574            log.error(msg_err)
 4575            raise ValueError(msg_err)
 4576
 4577        # Param
 4578        param = self.get_param()
 4579        log.debug("Param: " + str(param))
 4580
 4581        # Param - Exomiser
 4582        param_exomiser = param.get("annotation", {}).get("exomiser", {})
 4583        log.debug(f"Param Exomiser: {param_exomiser}")
 4584
 4585        # Param - Assembly
 4586        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4587        log.debug("Assembly: " + str(assembly))
 4588
 4589        # Data
 4590        table_variants = self.get_table_variants()
 4591
 4592        # Check if not empty
 4593        log.debug("Check if not empty")
 4594        sql_query_chromosomes = (
 4595            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4596        )
 4597        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4598            log.info(f"VCF empty")
 4599            return False
 4600
 4601        # VCF header
 4602        vcf_reader = self.get_header()
 4603        log.debug("Initial header: " + str(vcf_reader.infos))
 4604
 4605        # Samples
 4606        samples = self.get_header_sample_list()
 4607        if not samples:
 4608            log.error("No Samples in VCF")
 4609            return False
 4610        log.debug(f"Samples: {samples}")
 4611
 4612        # Memory limit
 4613        memory_limit = self.get_memory("8G")
 4614        log.debug(f"memory_limit: {memory_limit}")
 4615
 4616        # Exomiser java options
 4617        exomiser_java_options = (
 4618            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 4619        )
 4620        log.debug(f"Exomiser java options: {exomiser_java_options}")
 4621
 4622        # Download Exomiser (if not exists)
 4623        exomiser_release = param_exomiser.get("release", None)
 4624        exomiser_application_properties = param_exomiser.get(
 4625            "exomiser_application_properties", None
 4626        )
 4627        databases_download_exomiser(
 4628            assemblies=[assembly],
 4629            exomiser_folder=databases_folders,
 4630            exomiser_release=exomiser_release,
 4631            exomiser_phenotype_release=exomiser_release,
 4632            exomiser_application_properties=exomiser_application_properties,
 4633        )
 4634
 4635        # Force annotation
 4636        force_update_annotation = True
 4637
 4638        if "Exomiser" not in self.get_header().infos or force_update_annotation:
 4639            log.debug("Start annotation Exomiser")
 4640
 4641            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 4642
 4643                # tmp_dir = "/tmp/exomiser"
 4644
 4645                ### ANALYSIS ###
 4646                ################
 4647
 4648                # Create analysis.json through analysis dict
 4649                # either analysis in param or by default
 4650                # depending on preset exome/genome)
 4651
 4652                # Init analysis dict
 4653                param_exomiser_analysis_dict = {}
 4654
 4655                # analysis from param
 4656                param_exomiser_analysis = param_exomiser.get("analysis", {})
 4657                param_exomiser_analysis = full_path(param_exomiser_analysis)
 4658
 4659                # If analysis in param -> load anlaysis json
 4660                if param_exomiser_analysis:
 4661
 4662                    # If param analysis is a file and exists
 4663                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
 4664                        param_exomiser_analysis
 4665                    ):
 4666                        # Load analysis file into analysis dict (either yaml or json)
 4667                        with open(param_exomiser_analysis) as json_file:
 4668                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
 4669
 4670                    # If param analysis is a dict
 4671                    elif isinstance(param_exomiser_analysis, dict):
 4672                        # Load analysis dict into analysis dict (either yaml or json)
 4673                        param_exomiser_analysis_dict = param_exomiser_analysis
 4674
 4675                    # Error analysis type
 4676                    else:
 4677                        log.error(f"Analysis type unknown. Check param file.")
 4678                        raise ValueError(f"Analysis type unknown. Check param file.")
 4679
 4680                # Case no input analysis config file/dict
 4681                # Use preset (exome/genome) to open default config file
 4682                if not param_exomiser_analysis_dict:
 4683
 4684                    # default preset
 4685                    default_preset = "exome"
 4686
 4687                    # Get param preset or default preset
 4688                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
 4689
 4690                    # Try to find if preset is a file
 4691                    if os.path.exists(param_exomiser_preset):
 4692                        # Preset file is provided in full path
 4693                        param_exomiser_analysis_default_config_file = (
 4694                            param_exomiser_preset
 4695                        )
 4696                    # elif os.path.exists(full_path(param_exomiser_preset)):
 4697                    #     # Preset file is provided in full path
 4698                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
 4699                    elif os.path.exists(
 4700                        os.path.join(folder_config, param_exomiser_preset)
 4701                    ):
 4702                        # Preset file is provided a basename in config folder (can be a path with subfolders)
 4703                        param_exomiser_analysis_default_config_file = os.path.join(
 4704                            folder_config, param_exomiser_preset
 4705                        )
 4706                    else:
 4707                        # Construct preset file
 4708                        param_exomiser_analysis_default_config_file = os.path.join(
 4709                            folder_config,
 4710                            f"preset-{param_exomiser_preset}-analysis.json",
 4711                        )
 4712
 4713                    # If preset file exists
 4714                    param_exomiser_analysis_default_config_file = full_path(
 4715                        param_exomiser_analysis_default_config_file
 4716                    )
 4717                    if os.path.exists(param_exomiser_analysis_default_config_file):
 4718                        # Load prest file into analysis dict (either yaml or json)
 4719                        with open(
 4720                            param_exomiser_analysis_default_config_file
 4721                        ) as json_file:
 4722                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
 4723                                json_file
 4724                            )
 4725
 4726                    # Error preset file
 4727                    else:
 4728                        log.error(
 4729                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4730                        )
 4731                        raise ValueError(
 4732                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4733                        )
 4734
 4735                # If no analysis dict created
 4736                if not param_exomiser_analysis_dict:
 4737                    log.error(f"No analysis config")
 4738                    raise ValueError(f"No analysis config")
 4739
 4740                # Log
 4741                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4742
 4743                ### PHENOPACKET ###
 4744                ###################
 4745
 4746                # If no PhenoPacket in analysis dict -> check in param
 4747                if "phenopacket" not in param_exomiser_analysis_dict:
 4748
 4749                    # If PhenoPacket in param -> load anlaysis json
 4750                    if param_exomiser.get("phenopacket", None):
 4751
 4752                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
 4753                        param_exomiser_phenopacket = full_path(
 4754                            param_exomiser_phenopacket
 4755                        )
 4756
 4757                        # If param phenopacket is a file and exists
 4758                        if isinstance(
 4759                            param_exomiser_phenopacket, str
 4760                        ) and os.path.exists(param_exomiser_phenopacket):
 4761                            # Load phenopacket file into analysis dict (either yaml or json)
 4762                            with open(param_exomiser_phenopacket) as json_file:
 4763                                param_exomiser_analysis_dict["phenopacket"] = (
 4764                                    yaml.safe_load(json_file)
 4765                                )
 4766
 4767                        # If param phenopacket is a dict
 4768                        elif isinstance(param_exomiser_phenopacket, dict):
 4769                            # Load phenopacket dict into analysis dict (either yaml or json)
 4770                            param_exomiser_analysis_dict["phenopacket"] = (
 4771                                param_exomiser_phenopacket
 4772                            )
 4773
 4774                        # Error phenopacket type
 4775                        else:
 4776                            log.error(f"Phenopacket type unknown. Check param file.")
 4777                            raise ValueError(
 4778                                f"Phenopacket type unknown. Check param file."
 4779                            )
 4780
 4781                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
 4782                if "phenopacket" not in param_exomiser_analysis_dict:
 4783
 4784                    # Init PhenoPacket
 4785                    param_exomiser_analysis_dict["phenopacket"] = {
 4786                        "id": "analysis",
 4787                        "proband": {},
 4788                    }
 4789
 4790                    ### Add subject ###
 4791
 4792                    # If subject exists
 4793                    param_exomiser_subject = param_exomiser.get("subject", {})
 4794
 4795                    # If subject not exists -> found sample ID
 4796                    if not param_exomiser_subject:
 4797
 4798                        # Found sample ID in param
 4799                        sample = param_exomiser.get("sample", None)
 4800
 4801                        # Find sample ID (first sample)
 4802                        if not sample:
 4803                            sample_list = self.get_header_sample_list()
 4804                            if len(sample_list) > 0:
 4805                                sample = sample_list[0]
 4806                            else:
 4807                                log.error(f"No sample found")
 4808                                raise ValueError(f"No sample found")
 4809
 4810                        # Create subject
 4811                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
 4812
 4813                    # Add to dict
 4814                    param_exomiser_analysis_dict["phenopacket"][
 4815                        "subject"
 4816                    ] = param_exomiser_subject
 4817
 4818                    ### Add "phenotypicFeatures" ###
 4819
 4820                    # If phenotypicFeatures exists
 4821                    param_exomiser_phenotypicfeatures = param_exomiser.get(
 4822                        "phenotypicFeatures", []
 4823                    )
 4824
 4825                    # If phenotypicFeatures not exists -> Try to infer from hpo list
 4826                    if not param_exomiser_phenotypicfeatures:
 4827
 4828                        # Found HPO in param
 4829                        param_exomiser_hpo = param_exomiser.get("hpo", [])
 4830
 4831                        # Split HPO if list in string format separated by comma
 4832                        if isinstance(param_exomiser_hpo, str):
 4833                            param_exomiser_hpo = param_exomiser_hpo.split(",")
 4834
 4835                        # Create HPO list
 4836                        for hpo in param_exomiser_hpo:
 4837                            hpo_clean = re.sub("[^0-9]", "", hpo)
 4838                            param_exomiser_phenotypicfeatures.append(
 4839                                {
 4840                                    "type": {
 4841                                        "id": f"HP:{hpo_clean}",
 4842                                        "label": f"HP:{hpo_clean}",
 4843                                    }
 4844                                }
 4845                            )
 4846
 4847                    # Add to dict
 4848                    param_exomiser_analysis_dict["phenopacket"][
 4849                        "phenotypicFeatures"
 4850                    ] = param_exomiser_phenotypicfeatures
 4851
 4852                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
 4853                    if not param_exomiser_phenotypicfeatures:
 4854                        for step in param_exomiser_analysis_dict.get(
 4855                            "analysis", {}
 4856                        ).get("steps", []):
 4857                            if "hiPhivePrioritiser" in step:
 4858                                param_exomiser_analysis_dict.get("analysis", {}).get(
 4859                                    "steps", []
 4860                                ).remove(step)
 4861
 4862                ### Add Input File ###
 4863
 4864                # Initial file name and htsFiles
 4865                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
 4866                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
 4867                    {
 4868                        "uri": tmp_vcf_name,
 4869                        "htsFormat": "VCF",
 4870                        "genomeAssembly": assembly,
 4871                    }
 4872                ]
 4873
 4874                ### Add metaData ###
 4875
 4876                # If metaData not in analysis dict
 4877                if "metaData" not in param_exomiser_analysis_dict:
 4878                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
 4879                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
 4880                        "createdBy": "howard",
 4881                        "phenopacketSchemaVersion": 1,
 4882                    }
 4883
 4884                ### OutputOptions ###
 4885
 4886                # Init output result folder
 4887                output_results = os.path.join(tmp_dir, "results")
 4888
 4889                # If no outputOptions in analysis dict
 4890                if "outputOptions" not in param_exomiser_analysis_dict:
 4891
 4892                    # default output formats
 4893                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
 4894
 4895                    # Get outputOptions in param
 4896                    output_options = param_exomiser.get("outputOptions", None)
 4897
 4898                    # If no output_options in param -> check
 4899                    if not output_options:
 4900                        output_options = {
 4901                            "outputContributingVariantsOnly": False,
 4902                            "numGenes": 0,
 4903                            "outputFormats": defaut_output_formats,
 4904                        }
 4905
 4906                    # Replace outputDirectory in output options
 4907                    output_options["outputDirectory"] = output_results
 4908                    output_options["outputFileName"] = "howard"
 4909
 4910                    # Add outputOptions in analysis dict
 4911                    param_exomiser_analysis_dict["outputOptions"] = output_options
 4912
 4913                else:
 4914
 4915                    # Replace output_results and output format (if exists in param)
 4916                    param_exomiser_analysis_dict["outputOptions"][
 4917                        "outputDirectory"
 4918                    ] = output_results
 4919                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
 4920                        list(
 4921                            set(
 4922                                param_exomiser_analysis_dict.get(
 4923                                    "outputOptions", {}
 4924                                ).get("outputFormats", [])
 4925                                + ["TSV_VARIANT", "VCF"]
 4926                            )
 4927                        )
 4928                    )
 4929
 4930                # log
 4931                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4932
 4933                ### ANALYSIS FILE ###
 4934                #####################
 4935
 4936                ### Full JSON analysis config file ###
 4937
 4938                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
 4939                with open(exomiser_analysis, "w") as fp:
 4940                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
 4941
 4942                ### SPLIT analysis and sample config files
 4943
 4944                # Splitted analysis dict
 4945                param_exomiser_analysis_dict_for_split = (
 4946                    param_exomiser_analysis_dict.copy()
 4947                )
 4948
 4949                # Phenopacket JSON file
 4950                exomiser_analysis_phenopacket = os.path.join(
 4951                    tmp_dir, "analysis_phenopacket.json"
 4952                )
 4953                with open(exomiser_analysis_phenopacket, "w") as fp:
 4954                    json.dump(
 4955                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
 4956                        fp,
 4957                        indent=4,
 4958                    )
 4959
 4960                # Analysis JSON file without Phenopacket parameters
 4961                param_exomiser_analysis_dict_for_split.pop("phenopacket")
 4962                exomiser_analysis_analysis = os.path.join(
 4963                    tmp_dir, "analysis_analysis.json"
 4964                )
 4965                with open(exomiser_analysis_analysis, "w") as fp:
 4966                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
 4967
 4968                ### INITAL VCF file ###
 4969                #######################
 4970
 4971                ### Create list of samples to use and include inti initial VCF file ####
 4972
 4973                # Subject (main sample)
 4974                # Get sample ID in analysis dict
 4975                sample_subject = (
 4976                    param_exomiser_analysis_dict.get("phenopacket", {})
 4977                    .get("subject", {})
 4978                    .get("id", None)
 4979                )
 4980                sample_proband = (
 4981                    param_exomiser_analysis_dict.get("phenopacket", {})
 4982                    .get("proband", {})
 4983                    .get("subject", {})
 4984                    .get("id", None)
 4985                )
 4986                sample = []
 4987                if sample_subject:
 4988                    sample.append(sample_subject)
 4989                if sample_proband:
 4990                    sample.append(sample_proband)
 4991
 4992                # Get sample ID within Pedigree
 4993                pedigree_persons_list = (
 4994                    param_exomiser_analysis_dict.get("phenopacket", {})
 4995                    .get("pedigree", {})
 4996                    .get("persons", {})
 4997                )
 4998
 4999                # Create list with all sample ID in pedigree (if exists)
 5000                pedigree_persons = []
 5001                for person in pedigree_persons_list:
 5002                    pedigree_persons.append(person.get("individualId"))
 5003
 5004                # Concat subject sample ID and samples ID in pedigreesamples
 5005                samples = list(set(sample + pedigree_persons))
 5006
 5007                # Check if sample list is not empty
 5008                if not samples:
 5009                    log.error(f"No samples found")
 5010                    raise ValueError(f"No samples found")
 5011
 5012                # Create VCF with sample (either sample in param or first one by default)
 5013                # Export VCF file
 5014                self.export_variant_vcf(
 5015                    vcf_file=tmp_vcf_name,
 5016                    remove_info=True,
 5017                    add_samples=True,
 5018                    list_samples=samples,
 5019                    index=False,
 5020                )
 5021
 5022                ### Execute Exomiser ###
 5023                ########################
 5024
 5025                # Init command
 5026                exomiser_command = ""
 5027
 5028                # Command exomiser options
 5029                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
 5030
 5031                # Release
 5032                exomiser_release = param_exomiser.get("release", None)
 5033                if exomiser_release:
 5034                    # phenotype data version
 5035                    exomiser_options += (
 5036                        f" --exomiser.phenotype.data-version={exomiser_release} "
 5037                    )
 5038                    # data version
 5039                    exomiser_options += (
 5040                        f" --exomiser.{assembly}.data-version={exomiser_release} "
 5041                    )
 5042                    # variant white list
 5043                    variant_white_list_file = (
 5044                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
 5045                    )
 5046                    if os.path.exists(
 5047                        os.path.join(
 5048                            databases_folders, assembly, variant_white_list_file
 5049                        )
 5050                    ):
 5051                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
 5052
 5053                # transcript_source
 5054                transcript_source = param_exomiser.get(
 5055                    "transcript_source", None
 5056                )  # ucsc, refseq, ensembl
 5057                if transcript_source:
 5058                    exomiser_options += (
 5059                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
 5060                    )
 5061
 5062                # If analysis contain proband param
 5063                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
 5064                    "proband", {}
 5065                ):
 5066                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
 5067
 5068                # If no proband (usually uniq sample)
 5069                else:
 5070                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
 5071
 5072                # Log
 5073                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
 5074
 5075                # Run command
 5076                result = subprocess.call(
 5077                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
 5078                )
 5079                if result:
 5080                    log.error("Exomiser command failed")
 5081                    raise ValueError("Exomiser command failed")
 5082
 5083                ### RESULTS ###
 5084                ###############
 5085
 5086                ### Annotate with TSV fields ###
 5087
 5088                # Init result tsv file
 5089                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
 5090
 5091                # Init result tsv file
 5092                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
 5093
 5094                # Parse TSV file and explode columns in INFO field
 5095                if exomiser_to_info and os.path.exists(output_results_tsv):
 5096
 5097                    # Log
 5098                    log.debug("Exomiser columns to VCF INFO field")
 5099
 5100                    # Retrieve columns and types
 5101                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
 5102                    output_results_tsv_df = self.get_query_to_df(query)
 5103                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
 5104
 5105                    # Init concat fields for update
 5106                    sql_query_update_concat_fields = []
 5107
 5108                    # Fields to avoid
 5109                    fields_to_avoid = [
 5110                        "CONTIG",
 5111                        "START",
 5112                        "END",
 5113                        "REF",
 5114                        "ALT",
 5115                        "QUAL",
 5116                        "FILTER",
 5117                        "GENOTYPE",
 5118                    ]
 5119
 5120                    # List all columns to add into header
 5121                    for header_column in output_results_tsv_columns:
 5122
 5123                        # If header column is enable
 5124                        if header_column not in fields_to_avoid:
 5125
 5126                            # Header info type
 5127                            header_info_type = "String"
 5128                            header_column_df = output_results_tsv_df[header_column]
 5129                            header_column_df_dtype = header_column_df.dtype
 5130                            if header_column_df_dtype == object:
 5131                                if (
 5132                                    pd.to_numeric(header_column_df, errors="coerce")
 5133                                    .notnull()
 5134                                    .all()
 5135                                ):
 5136                                    header_info_type = "Float"
 5137                            else:
 5138                                header_info_type = "Integer"
 5139
 5140                            # Header info
 5141                            characters_to_validate = ["-"]
 5142                            pattern = "[" + "".join(characters_to_validate) + "]"
 5143                            header_info_name = re.sub(
 5144                                pattern,
 5145                                "_",
 5146                                f"Exomiser_{header_column}".replace("#", ""),
 5147                            )
 5148                            header_info_number = "."
 5149                            header_info_description = (
 5150                                f"Exomiser {header_column} annotation"
 5151                            )
 5152                            header_info_source = "Exomiser"
 5153                            header_info_version = "unknown"
 5154                            header_info_code = CODE_TYPE_MAP[header_info_type]
 5155                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
 5156                                header_info_name,
 5157                                header_info_number,
 5158                                header_info_type,
 5159                                header_info_description,
 5160                                header_info_source,
 5161                                header_info_version,
 5162                                header_info_code,
 5163                            )
 5164
 5165                            # Add field to add for update to concat fields
 5166                            sql_query_update_concat_fields.append(
 5167                                f"""
 5168                                CASE
 5169                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
 5170                                    THEN concat(
 5171                                        '{header_info_name}=',
 5172                                        table_parquet."{header_column}",
 5173                                        ';'
 5174                                        )
 5175
 5176                                    ELSE ''
 5177                                END
 5178                            """
 5179                            )
 5180
 5181                    # Update query
 5182                    sql_query_update = f"""
 5183                        UPDATE {table_variants} as table_variants
 5184                            SET INFO = concat(
 5185                                            CASE
 5186                                                WHEN INFO NOT IN ('', '.')
 5187                                                THEN INFO
 5188                                                ELSE ''
 5189                                            END,
 5190                                            CASE
 5191                                                WHEN table_variants.INFO NOT IN ('','.')
 5192                                                THEN ';'
 5193                                                ELSE ''
 5194                                            END,
 5195                                            (
 5196                                            SELECT 
 5197                                                concat(
 5198                                                    {",".join(sql_query_update_concat_fields)}
 5199                                                )
 5200                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
 5201                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
 5202                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
 5203                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 5204                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 5205                                            )
 5206                                        )
 5207                            ;
 5208                        """
 5209
 5210                    # Update
 5211                    self.conn.execute(sql_query_update)
 5212
 5213                ### Annotate with VCF INFO field ###
 5214
 5215                # Init result VCF file
 5216                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
 5217
 5218                # If VCF exists
 5219                if os.path.exists(output_results_vcf):
 5220
 5221                    # Log
 5222                    log.debug("Exomiser result VCF update variants")
 5223
 5224                    # Find Exomiser INFO field annotation in header
 5225                    with gzip.open(output_results_vcf, "rt") as f:
 5226                        header_list = self.read_vcf_header(f)
 5227                    exomiser_vcf_header = vcf.Reader(
 5228                        io.StringIO("\n".join(header_list))
 5229                    )
 5230
 5231                    # Add annotation INFO field to header
 5232                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
 5233
 5234                    # Update variants with VCF
 5235                    self.update_from_vcf(output_results_vcf)
 5236
 5237        return True
 5238
 5239    def annotation_snpeff(self, threads: int = None) -> None:
 5240        """
 5241        This function annotate with snpEff
 5242
 5243        :param threads: The number of threads to use
 5244        :return: the value of the variable "return_value".
 5245        """
 5246
 5247        # DEBUG
 5248        log.debug("Start annotation with snpeff databases")
 5249
 5250        # Threads
 5251        if not threads:
 5252            threads = self.get_threads()
 5253        log.debug("Threads: " + str(threads))
 5254
 5255        # DEBUG
 5256        delete_tmp = True
 5257        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5258            delete_tmp = False
 5259            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5260
 5261        # Config
 5262        config = self.get_config()
 5263        log.debug("Config: " + str(config))
 5264
 5265        # Config - Folders - Databases
 5266        databases_folders = (
 5267            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
 5268        )
 5269        log.debug("Databases annotations: " + str(databases_folders))
 5270
 5271        # Config - snpEff bin command
 5272        snpeff_bin_command = get_bin_command(
 5273            bin="snpEff.jar",
 5274            tool="snpeff",
 5275            bin_type="jar",
 5276            config=config,
 5277            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 5278        )
 5279        if not snpeff_bin_command:
 5280            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
 5281            log.error(msg_err)
 5282            raise ValueError(msg_err)
 5283
 5284        # Config - snpEff databases
 5285        snpeff_databases = (
 5286            config.get("folders", {})
 5287            .get("databases", {})
 5288            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
 5289        )
 5290        snpeff_databases = full_path(snpeff_databases)
 5291        if snpeff_databases is not None and snpeff_databases != "":
 5292            log.debug(f"Create snpEff databases folder")
 5293            if not os.path.exists(snpeff_databases):
 5294                os.makedirs(snpeff_databases)
 5295
 5296        # Param
 5297        param = self.get_param()
 5298        log.debug("Param: " + str(param))
 5299
 5300        # Param
 5301        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
 5302        log.debug("Options: " + str(options))
 5303
 5304        # Param - Assembly
 5305        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 5306
 5307        # Param - Options
 5308        snpeff_options = (
 5309            param.get("annotation", {}).get("snpeff", {}).get("options", "")
 5310        )
 5311        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
 5312        snpeff_csvstats = (
 5313            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
 5314        )
 5315        if snpeff_stats:
 5316            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
 5317            snpeff_stats = full_path(snpeff_stats)
 5318            snpeff_options += f" -stats {snpeff_stats}"
 5319        if snpeff_csvstats:
 5320            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
 5321            snpeff_csvstats = full_path(snpeff_csvstats)
 5322            snpeff_options += f" -csvStats {snpeff_csvstats}"
 5323
 5324        # Data
 5325        table_variants = self.get_table_variants()
 5326
 5327        # Check if not empty
 5328        log.debug("Check if not empty")
 5329        sql_query_chromosomes = (
 5330            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5331        )
 5332        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
 5333        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 5334            log.info(f"VCF empty")
 5335            return
 5336
 5337        # Export in VCF
 5338        log.debug("Create initial file to annotate")
 5339        tmp_vcf = NamedTemporaryFile(
 5340            prefix=self.get_prefix(),
 5341            dir=self.get_tmp_dir(),
 5342            suffix=".vcf.gz",
 5343            delete=True,
 5344        )
 5345        tmp_vcf_name = tmp_vcf.name
 5346
 5347        # VCF header
 5348        vcf_reader = self.get_header()
 5349        log.debug("Initial header: " + str(vcf_reader.infos))
 5350
 5351        # Existing annotations
 5352        for vcf_annotation in self.get_header().infos:
 5353
 5354            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5355            log.debug(
 5356                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5357            )
 5358
 5359        # Memory limit
 5360        # if config.get("memory", None):
 5361        #     memory_limit = config.get("memory", "8G")
 5362        # else:
 5363        #     memory_limit = "8G"
 5364        memory_limit = self.get_memory("8G")
 5365        log.debug(f"memory_limit: {memory_limit}")
 5366
 5367        # snpEff java options
 5368        snpeff_java_options = (
 5369            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 5370        )
 5371        log.debug(f"Exomiser java options: {snpeff_java_options}")
 5372
 5373        force_update_annotation = True
 5374
 5375        if "ANN" not in self.get_header().infos or force_update_annotation:
 5376
 5377            # Check snpEff database
 5378            log.debug(f"Check snpEff databases {[assembly]}")
 5379            databases_download_snpeff(
 5380                folder=snpeff_databases, assemblies=[assembly], config=config
 5381            )
 5382
 5383            # Export VCF file
 5384            self.export_variant_vcf(
 5385                vcf_file=tmp_vcf_name,
 5386                remove_info=True,
 5387                add_samples=False,
 5388                index=True,
 5389            )
 5390
 5391            # Tmp file
 5392            err_files = []
 5393            tmp_annotate_vcf = NamedTemporaryFile(
 5394                prefix=self.get_prefix(),
 5395                dir=self.get_tmp_dir(),
 5396                suffix=".vcf",
 5397                delete=False,
 5398            )
 5399            tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5400            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5401            err_files.append(tmp_annotate_vcf_name_err)
 5402
 5403            # Command
 5404            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
 5405            log.debug(f"Annotation - snpEff command: {snpeff_command}")
 5406            run_parallel_commands([snpeff_command], 1)
 5407
 5408            # Error messages
 5409            log.info(f"Error/Warning messages:")
 5410            error_message_command_all = []
 5411            error_message_command_warning = []
 5412            error_message_command_err = []
 5413            for err_file in err_files:
 5414                with open(err_file, "r") as f:
 5415                    for line in f:
 5416                        message = line.strip()
 5417                        error_message_command_all.append(message)
 5418                        if line.startswith("[W::"):
 5419                            error_message_command_warning.append(message)
 5420                        if line.startswith("[E::"):
 5421                            error_message_command_err.append(f"{err_file}: " + message)
 5422            # log info
 5423            for message in list(
 5424                set(error_message_command_err + error_message_command_warning)
 5425            ):
 5426                log.info(f"   {message}")
 5427            # debug info
 5428            for message in list(set(error_message_command_all)):
 5429                log.debug(f"   {message}")
 5430            # failed
 5431            if len(error_message_command_err):
 5432                log.error("Annotation failed: Error in commands")
 5433                raise ValueError("Annotation failed: Error in commands")
 5434
 5435            # Find annotation in header
 5436            with open(tmp_annotate_vcf_name, "rt") as f:
 5437                header_list = self.read_vcf_header(f)
 5438            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5439
 5440            for ann in annovar_vcf_header.infos:
 5441                if ann not in self.get_header().infos:
 5442                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5443
 5444            # Update variants
 5445            log.info(f"Annotation - Updating...")
 5446            self.update_from_vcf(tmp_annotate_vcf_name)
 5447
 5448        else:
 5449            if "ANN" in self.get_header().infos:
 5450                log.debug(f"Existing snpEff annotations in VCF")
 5451            if force_update_annotation:
 5452                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
 5453
 5454    def annotation_annovar(self, threads: int = None) -> None:
 5455        """
 5456        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
 5457        annotations
 5458
 5459        :param threads: number of threads to use
 5460        :return: the value of the variable "return_value".
 5461        """
 5462
 5463        # DEBUG
 5464        log.debug("Start annotation with Annovar databases")
 5465
 5466        # Threads
 5467        if not threads:
 5468            threads = self.get_threads()
 5469        log.debug("Threads: " + str(threads))
 5470
 5471        # Tmp en Err files
 5472        tmp_files = []
 5473        err_files = []
 5474
 5475        # DEBUG
 5476        delete_tmp = True
 5477        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5478            delete_tmp = False
 5479            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5480
 5481        # Config
 5482        config = self.get_config()
 5483        log.debug("Config: " + str(config))
 5484
 5485        # Config - Folders - Databases
 5486        databases_folders = (
 5487            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
 5488        )
 5489        log.debug("Databases annotations: " + str(databases_folders))
 5490
 5491        # Config - annovar bin command
 5492        annovar_bin_command = get_bin_command(
 5493            bin="table_annovar.pl",
 5494            tool="annovar",
 5495            bin_type="perl",
 5496            config=config,
 5497            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
 5498        )
 5499        if not annovar_bin_command:
 5500            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
 5501            log.error(msg_err)
 5502            raise ValueError(msg_err)
 5503
 5504        # Config - BCFTools bin command
 5505        bcftools_bin_command = get_bin_command(
 5506            bin="bcftools",
 5507            tool="bcftools",
 5508            bin_type="bin",
 5509            config=config,
 5510            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 5511        )
 5512        if not bcftools_bin_command:
 5513            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 5514            log.error(msg_err)
 5515            raise ValueError(msg_err)
 5516
 5517        # Config - annovar databases
 5518        annovar_databases = (
 5519            config.get("folders", {})
 5520            .get("databases", {})
 5521            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
 5522        )
 5523        if annovar_databases is not None:
 5524            if isinstance(annovar_databases, list):
 5525                annovar_databases = full_path(annovar_databases[0])
 5526                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
 5527            annovar_databases = full_path(annovar_databases)
 5528            if not os.path.exists(annovar_databases):
 5529                log.info(f"Annovar databases folder '{annovar_databases}' created")
 5530                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
 5531        else:
 5532            msg_err = f"Annovar databases configuration failed"
 5533            log.error(msg_err)
 5534            raise ValueError(msg_err)
 5535
 5536        # Param
 5537        param = self.get_param()
 5538        log.debug("Param: " + str(param))
 5539
 5540        # Param - options
 5541        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
 5542        log.debug("Options: " + str(options))
 5543
 5544        # Param - annotations
 5545        annotations = (
 5546            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
 5547        )
 5548        log.debug("Annotations: " + str(annotations))
 5549
 5550        # Param - Assembly
 5551        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 5552
 5553        # Annovar database assembly
 5554        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
 5555        if annovar_databases_assembly != "" and not os.path.exists(
 5556            annovar_databases_assembly
 5557        ):
 5558            os.makedirs(annovar_databases_assembly)
 5559
 5560        # Data
 5561        table_variants = self.get_table_variants()
 5562
 5563        # Check if not empty
 5564        log.debug("Check if not empty")
 5565        sql_query_chromosomes = (
 5566            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5567        )
 5568        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 5569        if not sql_query_chromosomes_df["count"][0]:
 5570            log.info(f"VCF empty")
 5571            return
 5572
 5573        # VCF header
 5574        vcf_reader = self.get_header()
 5575        log.debug("Initial header: " + str(vcf_reader.infos))
 5576
 5577        # Existing annotations
 5578        for vcf_annotation in self.get_header().infos:
 5579
 5580            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5581            log.debug(
 5582                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5583            )
 5584
 5585        force_update_annotation = True
 5586
 5587        if annotations:
 5588
 5589            commands = []
 5590            tmp_annotates_vcf_name_list = []
 5591
 5592            # Export in VCF
 5593            log.debug("Create initial file to annotate")
 5594            tmp_vcf = NamedTemporaryFile(
 5595                prefix=self.get_prefix(),
 5596                dir=self.get_tmp_dir(),
 5597                suffix=".vcf.gz",
 5598                delete=False,
 5599            )
 5600            tmp_vcf_name = tmp_vcf.name
 5601            tmp_files.append(tmp_vcf_name)
 5602            tmp_files.append(tmp_vcf_name + ".tbi")
 5603
 5604            # Export VCF file
 5605            self.export_variant_vcf(
 5606                vcf_file=tmp_vcf_name,
 5607                remove_info=".",
 5608                add_samples=False,
 5609                index=True,
 5610            )
 5611
 5612            # Create file for field rename
 5613            log.debug("Create file for field rename")
 5614            tmp_rename = NamedTemporaryFile(
 5615                prefix=self.get_prefix(),
 5616                dir=self.get_tmp_dir(),
 5617                suffix=".rename",
 5618                delete=False,
 5619            )
 5620            tmp_rename_name = tmp_rename.name
 5621            tmp_files.append(tmp_rename_name)
 5622
 5623            # Check Annovar database
 5624            log.debug(
 5625                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
 5626            )
 5627            databases_download_annovar(
 5628                folder=annovar_databases,
 5629                files=list(annotations.keys()),
 5630                assemblies=[assembly],
 5631            )
 5632
 5633            for annotation in annotations:
 5634                annotation_fields = annotations[annotation]
 5635
 5636                if not annotation_fields:
 5637                    annotation_fields = {"INFO": None}
 5638
 5639                log.info(f"Annotations Annovar - database '{annotation}'")
 5640                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")
 5641
 5642                # Tmp file for annovar
 5643                err_files = []
 5644                tmp_annotate_vcf_directory = TemporaryDirectory(
 5645                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
 5646                )
 5647                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
 5648                tmp_annotate_vcf_name_annovar = (
 5649                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
 5650                )
 5651                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
 5652                err_files.append(tmp_annotate_vcf_name_err)
 5653                tmp_files.append(tmp_annotate_vcf_name_err)
 5654
 5655                # Tmp file final vcf annotated by annovar
 5656                tmp_annotate_vcf = NamedTemporaryFile(
 5657                    prefix=self.get_prefix(),
 5658                    dir=self.get_tmp_dir(),
 5659                    suffix=".vcf.gz",
 5660                    delete=False,
 5661                )
 5662                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5663                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
 5664                tmp_files.append(tmp_annotate_vcf_name)
 5665                tmp_files.append(tmp_annotate_vcf_name + ".tbi")
 5666
 5667                # Number of fields
 5668                annotation_list = []
 5669                annotation_renamed_list = []
 5670
 5671                for annotation_field in annotation_fields:
 5672
                    # Field new name, if parameterized — currently SKIPPED (renaming not yet handled here; TODO)
 5674                    annotation_fields_new_name = annotation_fields.get(
 5675                        annotation_field, annotation_field
 5676                    )
 5677                    if not annotation_fields_new_name:
 5678                        annotation_fields_new_name = annotation_field
 5679
 5680                    if (
 5681                        force_update_annotation
 5682                        or annotation_fields_new_name not in self.get_header().infos
 5683                    ):
 5684                        annotation_list.append(annotation_field)
 5685                        annotation_renamed_list.append(annotation_fields_new_name)
 5686                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
 5687                        log.warning(
 5688                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 5689                        )
 5690
 5691                    # Add rename info
 5692                    run_parallel_commands(
 5693                        [
 5694                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
 5695                        ],
 5696                        1,
 5697                    )
 5698
 5699                # log.debug("fields_to_removed: " + str(fields_to_removed))
 5700                log.debug("annotation_list: " + str(annotation_list))
 5701
 5702                # protocol
 5703                protocol = annotation
 5704
 5705                # argument
 5706                argument = ""
 5707
 5708                # operation
 5709                operation = "f"
 5710                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
 5711                    "ensGene"
 5712                ):
 5713                    operation = "g"
 5714                    if options.get("genebase", None):
 5715                        argument = f"""'{options.get("genebase","")}'"""
 5716                elif annotation in ["cytoBand"]:
 5717                    operation = "r"
 5718
 5719                # argument option
 5720                argument_option = ""
 5721                if argument != "":
 5722                    argument_option = " --argument " + argument
 5723
 5724                # command options
 5725                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
 5726                for option in options:
 5727                    if option not in ["genebase"]:
 5728                        command_options += f""" --{option}={options[option]}"""
 5729
 5730                # Command
 5731
 5732                # Command - Annovar
 5733                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
 5734                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")
 5735
 5736                # Command - start pipe
 5737                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """
 5738
 5739                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
 5740                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """
 5741
 5742                # Command - Special characters (refGene annotation)
 5743                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """
 5744
 5745                # Command - Clean empty fields (with value ".")
 5746                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """
 5747
 5748                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
 5749                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
 5750                if "ALL" not in annotation_list and "INFO" not in annotation_list:
 5751                    # for ann in annotation_renamed_list:
 5752                    for ann in annotation_list:
 5753                        annovar_fields_to_keep.append(f"^INFO/{ann}")
 5754
 5755                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """
 5756
 5757                # Command - indexing
 5758                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """
 5759
 5760                log.debug(f"Annotation - Annovar command: {command_annovar}")
 5761                run_parallel_commands([command_annovar], 1)
 5762
 5763                # Error messages
 5764                log.info(f"Error/Warning messages:")
 5765                error_message_command_all = []
 5766                error_message_command_warning = []
 5767                error_message_command_err = []
 5768                for err_file in err_files:
 5769                    with open(err_file, "r") as f:
 5770                        for line in f:
 5771                            message = line.strip()
 5772                            error_message_command_all.append(message)
 5773                            if line.startswith("[W::") or line.startswith("WARNING"):
 5774                                error_message_command_warning.append(message)
 5775                            if line.startswith("[E::") or line.startswith("ERROR"):
 5776                                error_message_command_err.append(
 5777                                    f"{err_file}: " + message
 5778                                )
 5779                # log info
 5780                for message in list(
 5781                    set(error_message_command_err + error_message_command_warning)
 5782                ):
 5783                    log.info(f"   {message}")
 5784                # debug info
 5785                for message in list(set(error_message_command_all)):
 5786                    log.debug(f"   {message}")
 5787                # failed
 5788                if len(error_message_command_err):
 5789                    log.error("Annotation failed: Error in commands")
 5790                    raise ValueError("Annotation failed: Error in commands")
 5791
 5792            if tmp_annotates_vcf_name_list:
 5793
 5794                # List of annotated files
 5795                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)
 5796
 5797                # Tmp file
 5798                tmp_annotate_vcf = NamedTemporaryFile(
 5799                    prefix=self.get_prefix(),
 5800                    dir=self.get_tmp_dir(),
 5801                    suffix=".vcf.gz",
 5802                    delete=False,
 5803                )
 5804                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5805                tmp_files.append(tmp_annotate_vcf_name)
 5806                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5807                err_files.append(tmp_annotate_vcf_name_err)
 5808                tmp_files.append(tmp_annotate_vcf_name_err)
 5809
 5810                # Command merge
 5811                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
 5812                log.info(
 5813                    f"Annotation Annovar - Annotation merging "
 5814                    + str(len(tmp_annotates_vcf_name_list))
 5815                    + " annotated files"
 5816                )
 5817                log.debug(f"Annotation - merge command: {merge_command}")
 5818                run_parallel_commands([merge_command], 1)
 5819
 5820                # Find annotation in header
 5821                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
 5822                    header_list = self.read_vcf_header(f)
 5823                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5824
 5825                for ann in annovar_vcf_header.infos:
 5826                    if ann not in self.get_header().infos:
 5827                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5828
 5829                # Update variants
 5830                log.info(f"Annotation Annovar - Updating...")
 5831                self.update_from_vcf(tmp_annotate_vcf_name)
 5832
 5833            # Clean files
 5834            # Tmp file remove command
 5835            if True:
 5836                tmp_files_remove_command = ""
 5837                if tmp_files:
 5838                    tmp_files_remove_command = " ".join(tmp_files)
 5839                clean_command = f" rm -f {tmp_files_remove_command} "
 5840                log.debug(f"Annotation Annovar - Annotation cleaning ")
 5841                log.debug(f"Annotation - cleaning command: {clean_command}")
 5842                run_parallel_commands([clean_command], 1)
 5843
    # Parquet-based annotation
 5845    def annotation_parquet(self, threads: int = None) -> None:
 5846        """
 5847        It takes a VCF file, and annotates it with a parquet file
 5848
 5849        :param threads: number of threads to use for the annotation
 5850        :return: the value of the variable "result".
 5851        """
 5852
 5853        # DEBUG
 5854        log.debug("Start annotation with parquet databases")
 5855
 5856        # Threads
 5857        if not threads:
 5858            threads = self.get_threads()
 5859        log.debug("Threads: " + str(threads))
 5860
 5861        # DEBUG
 5862        delete_tmp = True
 5863        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5864            delete_tmp = False
 5865            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5866
 5867        # Config
 5868        databases_folders = set(
 5869            self.get_config()
 5870            .get("folders", {})
 5871            .get("databases", {})
 5872            .get("annotations", ["."])
 5873            + self.get_config()
 5874            .get("folders", {})
 5875            .get("databases", {})
 5876            .get("parquet", ["."])
 5877        )
 5878        log.debug("Databases annotations: " + str(databases_folders))
 5879
 5880        # Param
 5881        annotations = (
 5882            self.get_param()
 5883            .get("annotation", {})
 5884            .get("parquet", {})
 5885            .get("annotations", None)
 5886        )
 5887        log.debug("Annotations: " + str(annotations))
 5888
 5889        # Assembly
 5890        assembly = self.get_param().get(
 5891            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 5892        )
 5893
 5894        # Force Update Annotation
 5895        force_update_annotation = (
 5896            self.get_param()
 5897            .get("annotation", {})
 5898            .get("options", {})
 5899            .get("annotations_update", False)
 5900        )
 5901        log.debug(f"force_update_annotation={force_update_annotation}")
 5902        force_append_annotation = (
 5903            self.get_param()
 5904            .get("annotation", {})
 5905            .get("options", {})
 5906            .get("annotations_append", False)
 5907        )
 5908        log.debug(f"force_append_annotation={force_append_annotation}")
 5909
 5910        # Data
 5911        table_variants = self.get_table_variants()
 5912
 5913        # Check if not empty
 5914        log.debug("Check if not empty")
 5915        sql_query_chromosomes_df = self.get_query_to_df(
 5916            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
 5917        )
 5918        if not sql_query_chromosomes_df["count"][0]:
 5919            log.info(f"VCF empty")
 5920            return
 5921
 5922        # VCF header
 5923        vcf_reader = self.get_header()
 5924        log.debug("Initial header: " + str(vcf_reader.infos))
 5925
 5926        # Nb Variants POS
 5927        log.debug("NB Variants Start")
 5928        nb_variants = self.conn.execute(
 5929            f"SELECT count(*) AS count FROM variants"
 5930        ).fetchdf()["count"][0]
 5931        log.debug("NB Variants Stop")
 5932
 5933        # Existing annotations
 5934        for vcf_annotation in self.get_header().infos:
 5935
 5936            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5937            log.debug(
 5938                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5939            )
 5940
 5941        # Added columns
 5942        added_columns = []
 5943
 5944        # drop indexes
 5945        log.debug(f"Drop indexes...")
 5946        self.drop_indexes()
 5947
 5948        if annotations:
 5949
 5950            if "ALL" in annotations:
 5951
 5952                all_param = annotations.get("ALL", {})
 5953                all_param_formats = all_param.get("formats", None)
 5954                all_param_releases = all_param.get("releases", None)
 5955
 5956                databases_infos_dict = self.scan_databases(
 5957                    database_formats=all_param_formats,
 5958                    database_releases=all_param_releases,
 5959                )
 5960                for database_infos in databases_infos_dict.keys():
 5961                    if database_infos not in annotations:
 5962                        annotations[database_infos] = {"INFO": None}
 5963
 5964            for annotation in annotations:
 5965
 5966                if annotation in ["ALL"]:
 5967                    continue
 5968
 5969                # Annotation Name
 5970                annotation_name = os.path.basename(annotation)
 5971
 5972                # Annotation fields
 5973                annotation_fields = annotations[annotation]
 5974                if not annotation_fields:
 5975                    annotation_fields = {"INFO": None}
 5976
 5977                log.debug(f"Annotation '{annotation_name}'")
 5978                log.debug(
 5979                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 5980                )
 5981
 5982                # Create Database
 5983                database = Database(
 5984                    database=annotation,
 5985                    databases_folders=databases_folders,
 5986                    assembly=assembly,
 5987                )
 5988
 5989                # Find files
 5990                parquet_file = database.get_database()
 5991                parquet_hdr_file = database.get_header_file()
 5992                parquet_type = database.get_type()
 5993
 5994                # Check if files exists
 5995                if not parquet_file or not parquet_hdr_file:
 5996                    msg_err_list = []
 5997                    if not parquet_file:
 5998                        msg_err_list.append(
 5999                            f"Annotation failed: Annotation file not found"
 6000                        )
 6001                    if parquet_file and not parquet_hdr_file:
 6002                        msg_err_list.append(
 6003                            f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'"
 6004                        )
 6005
 6006                    log.error(". ".join(msg_err_list))
 6007                    raise ValueError(". ".join(msg_err_list))
 6008                else:
 6009                    # Get parquet connexion
 6010                    parquet_sql_attach = database.get_sql_database_attach(
 6011                        output="query"
 6012                    )
 6013                    if parquet_sql_attach:
 6014                        self.conn.execute(parquet_sql_attach)
 6015                    parquet_file_link = database.get_sql_database_link()
 6016                    # Log
 6017                    log.debug(
 6018                        f"Annotation '{annotation_name}' - file: "
 6019                        + str(parquet_file)
 6020                        + " and "
 6021                        + str(parquet_hdr_file)
 6022                    )
 6023
 6024                    # Database full header columns
 6025                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
 6026                        parquet_hdr_file
 6027                    )
 6028                    # Log
 6029                    log.debug(
 6030                        "Annotation database header columns : "
 6031                        + str(parquet_hdr_vcf_header_columns)
 6032                    )
 6033
 6034                    # Load header as VCF object
 6035                    parquet_hdr_vcf_header_infos = database.get_header().infos
 6036                    # Log
 6037                    log.debug(
 6038                        "Annotation database header: "
 6039                        + str(parquet_hdr_vcf_header_infos)
 6040                    )
 6041
 6042                    # Get extra infos
 6043                    parquet_columns = database.get_extra_columns()
 6044                    # Log
 6045                    log.debug("Annotation database Columns: " + str(parquet_columns))
 6046
 6047                    # Add extra columns if "ALL" in annotation_fields
 6048                    # if "ALL" in annotation_fields:
 6049                    #     allow_add_extra_column = True
 6050                    if "ALL" in annotation_fields and database.get_extra_columns():
 6051                        for extra_column in database.get_extra_columns():
 6052                            if (
 6053                                extra_column not in annotation_fields
 6054                                and extra_column.replace("INFO/", "")
 6055                                not in parquet_hdr_vcf_header_infos
 6056                            ):
 6057                                parquet_hdr_vcf_header_infos[extra_column] = (
 6058                                    vcf.parser._Info(
 6059                                        extra_column,
 6060                                        ".",
 6061                                        "String",
 6062                                        f"{extra_column} description",
 6063                                        "unknown",
 6064                                        "unknown",
 6065                                        self.code_type_map["String"],
 6066                                    )
 6067                                )
 6068
 6069                    # For all fields in database
 6070                    annotation_fields_all = False
 6071                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 6072                        annotation_fields_all = True
 6073                        annotation_fields = {
 6074                            key: key for key in parquet_hdr_vcf_header_infos
 6075                        }
 6076
 6077                        log.debug(
 6078                            "Annotation database header - All annotations added: "
 6079                            + str(annotation_fields)
 6080                        )
 6081
 6082                    # Init
 6083
 6084                    # List of annotation fields to use
 6085                    sql_query_annotation_update_info_sets = []
 6086
 6087                    # List of annotation to agregate
 6088                    sql_query_annotation_to_agregate = []
 6089
 6090                    # Number of fields
 6091                    nb_annotation_field = 0
 6092
 6093                    # Annotation fields processed
 6094                    annotation_fields_processed = []
 6095
 6096                    # Columns mapping
 6097                    map_columns = database.map_columns(
 6098                        columns=annotation_fields, prefixes=["INFO/"]
 6099                    )
 6100
 6101                    # Query dict for fields to remove (update option)
 6102                    query_dict_remove = {}
 6103
                    # Fetch annotation fields
 6105                    for annotation_field in annotation_fields:
 6106
 6107                        # annotation_field_column
 6108                        annotation_field_column = map_columns.get(
 6109                            annotation_field, "INFO"
 6110                        )
 6111
 6112                        # field new name, if parametered
 6113                        annotation_fields_new_name = annotation_fields.get(
 6114                            annotation_field, annotation_field
 6115                        )
 6116                        if not annotation_fields_new_name:
 6117                            annotation_fields_new_name = annotation_field
 6118
 6119                        # To annotate
 6120                        # force_update_annotation = True
 6121                        # force_append_annotation = True
 6122                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
 6123                        if annotation_field in parquet_hdr_vcf_header_infos and (
 6124                            force_update_annotation
 6125                            or force_append_annotation
 6126                            or (
 6127                                annotation_fields_new_name
 6128                                not in self.get_header().infos
 6129                            )
 6130                        ):
 6131
 6132                            # Add field to annotation to process list
 6133                            annotation_fields_processed.append(
 6134                                annotation_fields_new_name
 6135                            )
 6136
 6137                            # explode infos for the field
 6138                            annotation_fields_new_name_info_msg = ""
 6139                            if (
 6140                                force_update_annotation
 6141                                and annotation_fields_new_name
 6142                                in self.get_header().infos
 6143                            ):
 6144                                # Remove field from INFO
 6145                                query = f"""
 6146                                    UPDATE {table_variants} as table_variants
 6147                                    SET INFO = REGEXP_REPLACE(
 6148                                                concat(table_variants.INFO,''),
 6149                                                ';*{annotation_fields_new_name}=[^;]*',
 6150                                                ''
 6151                                                )
 6152                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
 6153                                """
 6154                                annotation_fields_new_name_info_msg = " [update]"
 6155                                query_dict_remove[
 6156                                    f"remove 'INFO/{annotation_fields_new_name}'"
 6157                                ] = query
 6158
 6159                            # Sep between fields in INFO
 6160                            nb_annotation_field += 1
 6161                            if nb_annotation_field > 1:
 6162                                annotation_field_sep = ";"
 6163                            else:
 6164                                annotation_field_sep = ""
 6165
 6166                            log.info(
 6167                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
 6168                            )
 6169
 6170                            # Add INFO field to header
 6171                            parquet_hdr_vcf_header_infos_number = (
 6172                                parquet_hdr_vcf_header_infos[annotation_field].num
 6173                                or "."
 6174                            )
 6175                            parquet_hdr_vcf_header_infos_type = (
 6176                                parquet_hdr_vcf_header_infos[annotation_field].type
 6177                                or "String"
 6178                            )
 6179                            parquet_hdr_vcf_header_infos_description = (
 6180                                parquet_hdr_vcf_header_infos[annotation_field].desc
 6181                                or f"{annotation_field} description"
 6182                            )
 6183                            parquet_hdr_vcf_header_infos_source = (
 6184                                parquet_hdr_vcf_header_infos[annotation_field].source
 6185                                or "unknown"
 6186                            )
 6187                            parquet_hdr_vcf_header_infos_version = (
 6188                                parquet_hdr_vcf_header_infos[annotation_field].version
 6189                                or "unknown"
 6190                            )
 6191
 6192                            vcf_reader.infos[annotation_fields_new_name] = (
 6193                                vcf.parser._Info(
 6194                                    annotation_fields_new_name,
 6195                                    parquet_hdr_vcf_header_infos_number,
 6196                                    parquet_hdr_vcf_header_infos_type,
 6197                                    parquet_hdr_vcf_header_infos_description,
 6198                                    parquet_hdr_vcf_header_infos_source,
 6199                                    parquet_hdr_vcf_header_infos_version,
 6200                                    self.code_type_map[
 6201                                        parquet_hdr_vcf_header_infos_type
 6202                                    ],
 6203                                )
 6204                            )
 6205
 6206                            # Append
 6207                            if force_append_annotation:
 6208                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
 6209                            else:
 6210                                query_case_when_append = ""
 6211
 6212                            # Annotation/Update query fields
 6213                            # Found in INFO column
 6214                            if (
 6215                                annotation_field_column == "INFO"
 6216                                and "INFO" in parquet_hdr_vcf_header_columns
 6217                            ):
 6218                                sql_query_annotation_update_info_sets.append(
 6219                                    f"""
 6220                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
 6221                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
 6222                                        ELSE ''
 6223                                    END
 6224                                """
 6225                                )
 6226                            # Found in a specific column
 6227                            else:
 6228                                sql_query_annotation_update_info_sets.append(
 6229                                    f"""
 6230                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
 6231                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
 6232                                        ELSE ''
 6233                                    END
 6234                                """
 6235                                )
 6236                                sql_query_annotation_to_agregate.append(
 6237                                    f""" string_agg(table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
 6238                                )
 6239
 6240                        # Not to annotate
 6241                        else:
 6242
 6243                            if force_update_annotation:
 6244                                annotation_message = "forced"
 6245                            else:
 6246                                annotation_message = "skipped"
 6247
 6248                            if annotation_field not in parquet_hdr_vcf_header_infos:
 6249                                log.warning(
 6250                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
 6251                                )
 6252                            if annotation_fields_new_name in self.get_header().infos:
 6253                                log.warning(
 6254                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
 6255                                )
 6256
 6257                    # Check if ALL fields have to be annotated. Thus concat all INFO field
 6258                    # allow_annotation_full_info = True
 6259                    allow_annotation_full_info = not force_append_annotation
 6260
 6261                    if parquet_type in ["regions"]:
 6262                        allow_annotation_full_info = False
 6263
 6264                    if (
 6265                        allow_annotation_full_info
 6266                        and nb_annotation_field == len(annotation_fields)
 6267                        and annotation_fields_all
 6268                        and (
 6269                            "INFO" in parquet_hdr_vcf_header_columns
 6270                            and "INFO" in database.get_extra_columns()
 6271                        )
 6272                    ):
 6273                        log.debug("Column INFO annotation enabled")
 6274                        sql_query_annotation_update_info_sets = []
 6275                        sql_query_annotation_update_info_sets.append(
 6276                            f" table_parquet.INFO "
 6277                        )
 6278
 6279                    if sql_query_annotation_update_info_sets:
 6280
 6281                        # Annotate
 6282                        log.info(f"Annotation '{annotation_name}' - Annotation...")
 6283
 6284                        # Join query annotation update info sets for SQL
 6285                        sql_query_annotation_update_info_sets_sql = ",".join(
 6286                            sql_query_annotation_update_info_sets
 6287                        )
 6288
 6289                        # Check chromosomes list (and variants infos)
 6290                        sql_query_chromosomes = f"""
 6291                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
 6292                            FROM {table_variants} as table_variants
 6293                            GROUP BY table_variants."#CHROM"
 6294                            ORDER BY table_variants."#CHROM"
 6295                            """
 6296                        sql_query_chromosomes_df = self.conn.execute(
 6297                            sql_query_chromosomes
 6298                        ).df()
 6299                        sql_query_chromosomes_dict = {
 6300                            entry["CHROM"]: {
 6301                                "count": entry["count_variants"],
 6302                                "min": entry["min_variants"],
 6303                                "max": entry["max_variants"],
 6304                            }
 6305                            for index, entry in sql_query_chromosomes_df.iterrows()
 6306                        }
 6307
 6308                        # Init
 6309                        nb_of_query = 0
 6310                        nb_of_variant_annotated = 0
 6311                        query_dict = query_dict_remove
 6312
 6313                        # for chrom in sql_query_chromosomes_df["CHROM"]:
 6314                        for chrom in sql_query_chromosomes_dict:
 6315
 6316                            # Number of variant by chromosome
 6317                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
 6318                                chrom, {}
 6319                            ).get("count", 0)
 6320
 6321                            log.debug(
 6322                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
 6323                            )
 6324
 6325                            # Annotation with regions database
 6326                            if parquet_type in ["regions"]:
 6327                                sql_query_annotation_from_clause = f"""
 6328                                    FROM (
 6329                                        SELECT 
 6330                                            '{chrom}' AS \"#CHROM\",
 6331                                            table_variants_from.\"POS\" AS \"POS\",
 6332                                            {",".join(sql_query_annotation_to_agregate)}
 6333                                        FROM {table_variants} as table_variants_from
 6334                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
 6335                                            table_parquet_from."#CHROM" = '{chrom}'
 6336                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
 6337                                            AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
 6338                                        )
 6339                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
 6340                                        GROUP BY table_variants_from.\"POS\"
 6341                                        )
 6342                                        as table_parquet
 6343                                """
 6344
 6345                                sql_query_annotation_where_clause = """
 6346                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
 6347                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 6348                                """
 6349
 6350                            # Annotation with variants database
 6351                            else:
 6352                                sql_query_annotation_from_clause = f"""
 6353                                    FROM {parquet_file_link} as table_parquet
 6354                                """
 6355                                sql_query_annotation_where_clause = f"""
 6356                                    table_variants."#CHROM" = '{chrom}'
 6357                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
 6358                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 6359                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 6360                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 6361                                """
 6362
 6363                            # Create update query
 6364                            sql_query_annotation_chrom_interval_pos = f"""
 6365                                UPDATE {table_variants} as table_variants
 6366                                    SET INFO = 
 6367                                        concat(
 6368                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 6369                                                THEN table_variants.INFO
 6370                                                ELSE ''
 6371                                            END
 6372                                            ,
 6373                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 6374                                                        AND (
 6375                                                        concat({sql_query_annotation_update_info_sets_sql})
 6376                                                        )
 6377                                                        NOT IN ('','.') 
 6378                                                    THEN ';'
 6379                                                    ELSE ''
 6380                                            END
 6381                                            ,
 6382                                            {sql_query_annotation_update_info_sets_sql}
 6383                                            )
 6384                                    {sql_query_annotation_from_clause}
 6385                                    WHERE {sql_query_annotation_where_clause}
 6386                                    ;
 6387                                """
 6388
 6389                            # Add update query to dict
 6390                            query_dict[
 6391                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
 6392                            ] = sql_query_annotation_chrom_interval_pos
 6393
 6394                        nb_of_query = len(query_dict)
 6395                        num_query = 0
 6396
 6397                        # SET max_expression_depth TO x
 6398                        self.conn.execute("SET max_expression_depth TO 10000")
 6399
 6400                        for query_name in query_dict:
 6401                            query = query_dict[query_name]
 6402                            num_query += 1
 6403                            log.info(
 6404                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
 6405                            )
 6406                            result = self.conn.execute(query)
 6407                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
 6408                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
 6409                            log.info(
 6410                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
 6411                            )
 6412
 6413                        log.info(
 6414                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
 6415                        )
 6416
 6417                    else:
 6418
 6419                        log.info(
 6420                            f"Annotation '{annotation_name}' - No Annotations available"
 6421                        )
 6422
 6423                    log.debug("Final header: " + str(vcf_reader.infos))
 6424
 6425        # Remove added columns
 6426        for added_column in added_columns:
 6427            self.drop_column(column=added_column)
 6428
 6429    def annotation_splice(self, threads: int = None) -> None:
 6430        """
 6431        This function annotate with snpEff
 6432
 6433        :param threads: The number of threads to use
 6434        :return: the value of the variable "return_value".
 6435        """
 6436
 6437        # DEBUG
 6438        log.debug("Start annotation with splice tools")
 6439
 6440        # Threads
 6441        if not threads:
 6442            threads = self.get_threads()
 6443        log.debug("Threads: " + str(threads))
 6444
 6445        # DEBUG
 6446        delete_tmp = True
 6447        if self.get_config().get("verbosity", "warning") in ["debug"]:
 6448            delete_tmp = False
 6449            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 6450
 6451        # Config
 6452        config = self.get_config()
 6453        log.debug("Config: " + str(config))
 6454        splice_config = config.get("tools", {}).get("splice", {})
 6455        if not splice_config:
 6456            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
 6457            msg_err = "No Splice tool config"
 6458            raise ValueError(msg_err)
 6459        log.debug(f"splice_config: {splice_config}")
 6460
 6461        # Config - Folders - Databases
 6462        databases_folders = (
 6463            config.get("folders", {}).get("databases", {}).get("splice", ["."])
 6464        )
 6465        log.debug("Databases annotations: " + str(databases_folders))
 6466
 6467        # Splice docker image
 6468        splice_docker_image = splice_config.get("docker").get("image")
 6469
 6470        # Pull splice image if it's not already there
 6471        if not check_docker_image_exists(splice_docker_image):
 6472            log.warning(
 6473                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
 6474            )
 6475            try:
 6476                command(f"docker pull {splice_config.get('docker').get('image')}")
 6477            except subprocess.CalledProcessError:
 6478                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
 6479                log.error(msg_err)
 6480                raise ValueError(msg_err)
 6481
 6482        # Config - splice databases
 6483        splice_databases = (
 6484            config.get("folders", {})
 6485            .get("databases", {})
 6486            .get("splice", DEFAULT_SPLICE_FOLDER)
 6487        )
 6488        splice_databases = full_path(splice_databases)
 6489
 6490        # Param
 6491        param = self.get_param()
 6492        log.debug("Param: " + str(param))
 6493
 6494        # Param
 6495        options = param.get("annotation", {}).get("splice", {}).get("options", {})
 6496        log.debug("Options: " + str(options))
 6497
 6498        # Data
 6499        table_variants = self.get_table_variants()
 6500
 6501        # Check if not empty
 6502        log.debug("Check if not empty")
 6503        sql_query_chromosomes = (
 6504            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 6505        )
 6506        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 6507            log.info("VCF empty")
 6508            return None
 6509
 6510        # Export in VCF
 6511        log.debug("Create initial file to annotate")
 6512
 6513        # Create output folder / work folder
 6514        if options.get("output_folder", ""):
 6515            output_folder = options.get("output_folder", "")
 6516            if not os.path.exists(output_folder):
 6517                Path(output_folder).mkdir(parents=True, exist_ok=True)
 6518        else:
 6519            output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
 6520            if not os.path.exists(output_folder):
 6521                Path(output_folder).mkdir(parents=True, exist_ok=True)
 6522
 6523        if options.get("workdir", ""):
 6524            workdir = options.get("workdir", "")
 6525        else:
 6526            workdir = "/work"
 6527
 6528        # Create tmp VCF file
 6529        tmp_vcf = NamedTemporaryFile(
 6530            prefix=self.get_prefix(),
 6531            dir=output_folder,
 6532            suffix=".vcf",
 6533            delete=False,
 6534        )
 6535        tmp_vcf_name = tmp_vcf.name
 6536
 6537        # VCF header
 6538        header = self.get_header()
 6539
 6540        # Existing annotations
 6541        for vcf_annotation in self.get_header().infos:
 6542
 6543            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 6544            log.debug(
 6545                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 6546            )
 6547
 6548        # Memory limit
 6549        if config.get("memory", None):
 6550            memory_limit = config.get("memory", "8G").upper()
 6551            # upper()
 6552        else:
 6553            memory_limit = "8G"
 6554        log.debug(f"memory_limit: {memory_limit}")
 6555
 6556        # Check number of variants to annotate
 6557        where_clause_regex_spliceai = r"SpliceAI_\w+"
 6558        where_clause_regex_spip = r"SPiP_\w+"
 6559        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
 6560        df_list_of_variants_to_annotate = self.get_query_to_df(
 6561            query=f""" SELECT * FROM variants {where_clause} """
 6562        )
 6563        if len(df_list_of_variants_to_annotate) == 0:
 6564            log.warning(
 6565                f"No variants to annotate with splice. Variants probably already annotated with splice"
 6566            )
 6567            return None
 6568        else:
 6569            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
 6570
 6571        # Export VCF file
 6572        self.export_variant_vcf(
 6573            vcf_file=tmp_vcf_name,
 6574            remove_info=True,
 6575            add_samples=True,
 6576            index=False,
 6577            where_clause=where_clause,
 6578        )
 6579        mount = [f" -v {path}:{path}:rw" for path in [output_folder]]
 6580        if any(value for value in splice_config.values() if value is None):
 6581            log.warning("At least one splice config parameter is empty")
 6582            # exit annotation_splice
 6583            return None
 6584
 6585        # Params in splice nf
 6586        def check_values(dico: dict):
 6587            """
 6588            Ensure parameters for NF splice pipeline
 6589            """
 6590            for key, val in dico.items():
 6591                if key == "genome":
 6592                    if any(
 6593                        assemb in options.get("genome", {})
 6594                        for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
 6595                    ):
 6596                        yield f"--{key} hg19"
 6597                    elif any(
 6598                        assemb in options.get("genome", {})
 6599                        for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
 6600                    ):
 6601                        yield f"--{key} hg38"
 6602                elif (
 6603                    (isinstance(val, str) and val)
 6604                    or isinstance(val, int)
 6605                    or isinstance(val, bool)
 6606                ):
 6607                    yield f"--{key} {val}"
 6608
 6609        # Genome
 6610        genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
 6611        options["genome"] = genome
 6612        # NF params
 6613        nf_params = []
 6614        # Add options
 6615        if options:
 6616            log.debug(options)
 6617            nf_params = list(check_values(options))
 6618            log.debug(f"Splice NF params: {' '.join(nf_params)}")
 6619        else:
 6620            log.debug("No NF params provided")
 6621        # Add threads
 6622        if "threads" not in options.keys():
 6623            nf_params.append(f"--threads {threads}")
 6624        # Genome path
 6625        genome_path = find_genome(
 6626            config.get("folders", {})
 6627            .get("databases", {})
 6628            .get("genomes", DEFAULT_GENOME_FOLDER),
 6629            file=f"{genome}.fa",
 6630        )
 6631        # Add genome path
 6632        if not genome_path:
 6633            raise ValueError(
 6634                f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
 6635            )
 6636        else:
 6637            log.debug(f"Genome: {genome_path}")
 6638            nf_params.append(f"--genome_path {genome_path}")
 6639
 6640        def splice_annotations(options: dict = {}, config: dict = {}) -> list:
 6641            """
 6642            Setting up updated databases for SPiP and SpliceAI
 6643            """
 6644
 6645            try:
 6646
 6647                # SpliceAI assembly transcriptome
 6648                spliceai_assembly = os.path.join(
 6649                    config.get("folders", {}).get("databases", {}).get("spliceai", {}),
 6650                    options.get("genome"),
 6651                    "transcriptome",
 6652                )
 6653                spip_assembly = options.get("genome")
 6654
 6655                spip = find(
 6656                    f"transcriptome_{spip_assembly}.RData",
 6657                    config.get("folders", {}).get("databases", {}).get("spip", {}),
 6658                )
 6659                spliceai = find("spliceai.refseq.txt", spliceai_assembly)
 6660                log.debug(f"SPiP annotations: {spip}")
 6661                log.debug(f"SpliceAI annotations: {spliceai}")
 6662                if spip and spliceai:
 6663                    return [
 6664                        f"--spip_transcriptome {spip}",
 6665                        f"--spliceai_transcriptome {spliceai}",
 6666                    ]
 6667                else:
 6668                    log.warning(
 6669                        "Can't find splice databases in configuration, use annotations file from image"
 6670                    )
 6671            except TypeError:
 6672                log.warning(
 6673                    "Can't find splice databases in configuration, use annotations file from image"
 6674                )
 6675                return []
 6676
 6677        # Add options, check if transcriptome option have already beend provided
 6678        if (
 6679            "spip_transcriptome" not in nf_params
 6680            and "spliceai_transcriptome" not in nf_params
 6681        ):
 6682            splice_reference = splice_annotations(options, config)
 6683            if splice_reference:
 6684                nf_params.extend(splice_reference)
 6685        # nf_params.append(f"--output_folder {output_folder}")
 6686        random_uuid = f"HOWARD-SPLICE-{get_random()}"
 6687        cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
 6688        log.debug(cmd)
 6689        splice_config["docker"]["command"] = cmd
 6690
 6691        # Ensure proxy is set
 6692        proxy = [
 6693            f"-e {var}={os.getenv(var)}"
 6694            for var in ["https_proxy", "http_proxy", "ftp_proxy"]
 6695            if os.getenv(var) is not None
 6696        ]
 6697        docker_cmd = get_bin_command(
 6698            tool="splice",
 6699            bin_type="docker",
 6700            config=config,
 6701            default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
 6702            add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}",
 6703        )
 6704        # print(docker_cmd)
 6705        # exit()
 6706        # Docker debug
 6707        # if splice_config.get("rm_container"):
 6708        #     rm_container = "--rm"
 6709        # else:
 6710        #     rm_container = ""
 6711        # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
 6712        log.debug(docker_cmd)
 6713        res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
 6714        log.debug(res.stdout)
 6715        if res.stderr:
 6716            log.error(res.stderr)
 6717        res.check_returncode()
 6718        # Update variants
 6719        log.info("Annotation - Updating...")
 6720        # Test find output vcf
 6721        log.debug(
 6722            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6723        )
 6724        output_vcf = []
 6725        # Wrong folder to look in
 6726        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
 6727            if (
 6728                files
 6729                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6730            ):
 6731                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
 6732        # log.debug(os.listdir(options.get("output_folder")))
 6733        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
 6734        if not output_vcf:
 6735            log.debug(
 6736                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
 6737            )
 6738        else:
 6739            # Get new header from annotated vcf
 6740            log.debug(f"Initial header: {len(header.infos)} fields")
 6741            # Create new header with splice infos
 6742            new_vcf = Variants(input=output_vcf[0])
 6743            new_vcf_header = new_vcf.get_header().infos
 6744            for keys, infos in new_vcf_header.items():
 6745                if keys not in header.infos.keys():
 6746                    header.infos[keys] = infos
 6747            log.debug(f"New header: {len(header.infos)} fields")
 6748            log.debug(f"Splice tmp output: {output_vcf[0]}")
 6749            self.update_from_vcf(output_vcf[0])
 6750
 6751        # Remove file
 6752        remove_if_exists(output_vcf)
 6753
 6754    ###
 6755    # Prioritization
 6756    ###
 6757
 6758    def get_config_default(self, name: str) -> dict:
 6759        """
 6760        The function `get_config_default` returns a dictionary containing default configurations for
 6761        various calculations and prioritizations.
 6762
 6763        :param name: The `get_config_default` function returns a dictionary containing default
 6764        configurations for different calculations and prioritizations. The `name` parameter is used to
 6765        specify which specific configuration to retrieve from the dictionary
 6766        :type name: str
 6767        :return: The function `get_config_default` returns a dictionary containing default configuration
 6768        settings for different calculations and prioritizations. The specific configuration settings are
 6769        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
 6770        matches a key in the `config_default` dictionary, the corresponding configuration settings are
 6771        returned. If there is no match, an empty dictionary is returned.
 6772        """
 6773
 6774        config_default = {
 6775            "calculations": {
 6776                "variant_chr_pos_alt_ref": {
 6777                    "type": "sql",
 6778                    "name": "variant_chr_pos_alt_ref",
 6779                    "description": "Create a variant ID with chromosome, position, alt and ref",
 6780                    "available": False,
 6781                    "output_column_name": "variant_chr_pos_alt_ref",
 6782                    "output_column_type": "String",
 6783                    "output_column_description": "variant ID with chromosome, position, alt and ref",
 6784                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
 6785                    "operation_info": True,
 6786                },
 6787                "VARTYPE": {
 6788                    "type": "sql",
 6789                    "name": "VARTYPE",
 6790                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
 6791                    "available": True,
 6792                    "table": "variants",
 6793                    "output_column_name": "VARTYPE",
 6794                    "output_column_type": "String",
 6795                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
 6796                    "operation_query": """
 6797                            CASE
 6798                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
 6799                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
 6800                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
 6801                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
 6802                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
 6803                                ELSE 'UNDEFINED'
 6804                            END
 6805                            """,
 6806                    "info_fields": ["SVTYPE"],
 6807                    "operation_info": True,
 6808                },
 6809                "snpeff_hgvs": {
 6810                    "type": "python",
 6811                    "name": "snpeff_hgvs",
 6812                    "description": "HGVS nomenclatures from snpEff annotation",
 6813                    "available": True,
 6814                    "function_name": "calculation_extract_snpeff_hgvs",
 6815                    "function_params": ["snpeff_hgvs", "ANN"],
 6816                },
 6817                "snpeff_ann_explode": {
 6818                    "type": "python",
 6819                    "name": "snpeff_ann_explode",
 6820                    "description": "Explode snpEff annotations with uniquify values",
 6821                    "available": True,
 6822                    "function_name": "calculation_snpeff_ann_explode",
 6823                    "function_params": [False, "fields", "snpeff_", "ANN"],
 6824                },
 6825                "snpeff_ann_explode_uniquify": {
 6826                    "type": "python",
 6827                    "name": "snpeff_ann_explode_uniquify",
 6828                    "description": "Explode snpEff annotations",
 6829                    "available": True,
 6830                    "function_name": "calculation_snpeff_ann_explode",
 6831                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
 6832                },
 6833                "snpeff_ann_explode_json": {
 6834                    "type": "python",
 6835                    "name": "snpeff_ann_explode_json",
 6836                    "description": "Explode snpEff annotations in JSON format",
 6837                    "available": True,
 6838                    "function_name": "calculation_snpeff_ann_explode",
 6839                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
 6840                },
 6841                "NOMEN": {
 6842                    "type": "python",
 6843                    "name": "NOMEN",
 6844                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)",
 6845                    "available": True,
 6846                    "function_name": "calculation_extract_nomen",
 6847                    "function_params": [],
 6848                },
 6849                "RENAME_INFO_FIELDS": {
 6850                    "type": "python",
 6851                    "name": "RENAME_INFO_FIELDS",
 6852                    "description": "Rename or remove INFO/tags",
 6853                    "available": True,
 6854                    "function_name": "calculation_rename_info_fields",
 6855                    "function_params": [],
 6856                },
 6857                "FINDBYPIPELINE": {
 6858                    "type": "python",
 6859                    "name": "FINDBYPIPELINE",
 6860                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
 6861                    "available": True,
 6862                    "function_name": "calculation_find_by_pipeline",
 6863                    "function_params": ["findbypipeline"],
 6864                },
 6865                "FINDBYSAMPLE": {
 6866                    "type": "python",
 6867                    "name": "FINDBYSAMPLE",
 6868                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
 6869                    "available": True,
 6870                    "function_name": "calculation_find_by_pipeline",
 6871                    "function_params": ["findbysample"],
 6872                },
 6873                "GENOTYPECONCORDANCE": {
 6874                    "type": "python",
 6875                    "name": "GENOTYPECONCORDANCE",
 6876                    "description": "Concordance of genotype for multi caller VCF",
 6877                    "available": True,
 6878                    "function_name": "calculation_genotype_concordance",
 6879                    "function_params": [],
 6880                },
 6881                "BARCODE": {
 6882                    "type": "python",
 6883                    "name": "BARCODE",
 6884                    "description": "BARCODE as VaRank tool",
 6885                    "available": True,
 6886                    "function_name": "calculation_barcode",
 6887                    "function_params": [],
 6888                },
 6889                "BARCODEFAMILY": {
 6890                    "type": "python",
 6891                    "name": "BARCODEFAMILY",
 6892                    "description": "BARCODEFAMILY as VaRank tool",
 6893                    "available": True,
 6894                    "function_name": "calculation_barcode_family",
 6895                    "function_params": ["BCF"],
 6896                },
 6897                "TRIO": {
 6898                    "type": "python",
 6899                    "name": "TRIO",
 6900                    "description": "Inheritance for a trio family",
 6901                    "available": True,
 6902                    "function_name": "calculation_trio",
 6903                    "function_params": [],
 6904                },
 6905                "VAF": {
 6906                    "type": "python",
 6907                    "name": "VAF",
 6908                    "description": "Variant Allele Frequency (VAF) harmonization",
 6909                    "available": True,
 6910                    "function_name": "calculation_vaf_normalization",
 6911                    "function_params": [],
 6912                },
 6913                "VAF_stats": {
 6914                    "type": "python",
 6915                    "name": "VAF_stats",
 6916                    "description": "Variant Allele Frequency (VAF) statistics",
 6917                    "available": True,
 6918                    "function_name": "calculation_genotype_stats",
 6919                    "function_params": ["VAF"],
 6920                },
 6921                "DP_stats": {
 6922                    "type": "python",
 6923                    "name": "DP_stats",
 6924                    "description": "Depth (DP) statistics",
 6925                    "available": True,
 6926                    "function_name": "calculation_genotype_stats",
 6927                    "function_params": ["DP"],
 6928                },
 6929                "variant_id": {
 6930                    "type": "python",
 6931                    "name": "variant_id",
 6932                    "description": "Variant ID generated from variant position and type",
 6933                    "available": True,
 6934                    "function_name": "calculation_variant_id",
 6935                    "function_params": [],
 6936                },
 6937                "transcripts_json": {
 6938                    "type": "python",
 6939                    "name": "transcripts_json",
 6940                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
 6941                    "available": True,
 6942                    "function_name": "calculation_transcripts_annotation",
 6943                    "function_params": ["transcripts_json", None],
 6944                },
 6945                "transcripts_ann": {
 6946                    "type": "python",
 6947                    "name": "transcripts_ann",
 6948                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
 6949                    "available": True,
 6950                    "function_name": "calculation_transcripts_annotation",
 6951                    "function_params": [None, "transcripts_ann"],
 6952                },
 6953                "transcripts_annotations": {
 6954                    "type": "python",
 6955                    "name": "transcripts_annotations",
 6956                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
 6957                    "available": True,
 6958                    "function_name": "calculation_transcripts_annotation",
 6959                    "function_params": [None, None],
 6960                },
 6961                "transcripts_prioritization": {
 6962                    "type": "python",
 6963                    "name": "transcripts_prioritization",
 6964                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
 6965                    "available": True,
 6966                    "function_name": "calculation_transcripts_prioritization",
 6967                    "function_params": [],
 6968                },
 6969                "transcripts_export": {
 6970                    "type": "python",
 6971                    "name": "transcripts_export",
 6972                    "description": "Export transcripts table/view as a file (using param.json)",
 6973                    "available": True,
 6974                    "function_name": "calculation_transcripts_export",
 6975                    "function_params": [],
 6976                },
 6977            },
 6978            "prioritizations": {
 6979                "default": {
 6980                    "ANN2": [
 6981                        {
 6982                            "type": "contains",
 6983                            "value": "HIGH",
 6984                            "score": 5,
 6985                            "flag": "PASS",
 6986                            "comment": [
 6987                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
 6988                            ],
 6989                        },
 6990                        {
 6991                            "type": "contains",
 6992                            "value": "MODERATE",
 6993                            "score": 3,
 6994                            "flag": "PASS",
 6995                            "comment": [
 6996                                "A non-disruptive variant that might change protein effectiveness"
 6997                            ],
 6998                        },
 6999                        {
 7000                            "type": "contains",
 7001                            "value": "LOW",
 7002                            "score": 0,
 7003                            "flag": "FILTERED",
 7004                            "comment": [
 7005                                "Assumed to be mostly harmless or unlikely to change protein behavior"
 7006                            ],
 7007                        },
 7008                        {
 7009                            "type": "contains",
 7010                            "value": "MODIFIER",
 7011                            "score": 0,
 7012                            "flag": "FILTERED",
 7013                            "comment": [
 7014                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
 7015                            ],
 7016                        },
 7017                    ],
 7018                }
 7019            },
 7020        }
 7021
 7022        return config_default.get(name, None)
 7023
 7024    def get_config_json(
 7025        self, name: str, config_dict: dict = {}, config_file: str = None
 7026    ) -> dict:
 7027        """
 7028        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
 7029        default values, a dictionary, and a file.
 7030
 7031        :param name: The `name` parameter in the `get_config_json` function is a string that represents
 7032        the name of the configuration. It is used to identify and retrieve the configuration settings
 7033        for a specific component or module
 7034        :type name: str
 7035        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
 7036        dictionary that allows you to provide additional configuration settings or overrides. When you
 7037        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
 7038        the key is the configuration setting you want to override or
 7039        :type config_dict: dict
 7040        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
 7041        specify the path to a configuration file that contains additional settings. If provided, the
 7042        function will read the contents of this file and update the configuration dictionary with the
 7043        values found in the file, overriding any existing values with the
 7044        :type config_file: str
 7045        :return: The function `get_config_json` returns a dictionary containing the configuration
 7046        settings.
 7047        """
 7048
 7049        # Create with default prioritizations
 7050        config_default = self.get_config_default(name=name)
 7051        configuration = config_default
 7052        # log.debug(f"configuration={configuration}")
 7053
 7054        # Replace prioritizations from dict
 7055        for config in config_dict:
 7056            configuration[config] = config_dict[config]
 7057
 7058        # Replace prioritizations from file
 7059        config_file = full_path(config_file)
 7060        if config_file:
 7061            if os.path.exists(config_file):
 7062                with open(config_file) as config_file_content:
 7063                    config_file_dict = yaml.safe_load(config_file_content)
 7064                for config in config_file_dict:
 7065                    configuration[config] = config_file_dict[config]
 7066            else:
 7067                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
 7068                log.error(msg_error)
 7069                raise ValueError(msg_error)
 7070
 7071        return configuration
 7072
 7073    def prioritization(
 7074        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
 7075    ) -> bool:
 7076        """
 7077        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
 7078        prioritizes variants based on configured profiles and criteria.
 7079
 7080        :param table: The `table` parameter in the `prioritization` function is used to specify the name
 7081        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
 7082        a table name is provided, the method will prioritize the variants in that specific table
 7083        :type table: str
 7084        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
 7085        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
 7086        provided, the code will use a default prefix value of "PZ"
 7087        :type pz_prefix: str
 7088        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
 7089        additional parameters specific to the prioritization process. These parameters can include
 7090        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
 7091        configurations needed for the prioritization of variants in a V
 7092        :type pz_param: dict
 7093        :return: A boolean value (True) is being returned from the `prioritization` function.
 7094        """
 7095
 7096        # Config
 7097        config = self.get_config()
 7098
 7099        # Param
 7100        param = self.get_param()
 7101
 7102        # Prioritization param
 7103        if pz_param is not None:
 7104            prioritization_param = pz_param
 7105        else:
 7106            prioritization_param = param.get("prioritization", {})
 7107
 7108        # Configuration profiles
 7109        prioritization_config_file = prioritization_param.get(
 7110            "prioritization_config", None
 7111        )
 7112        prioritization_config_file = full_path(prioritization_config_file)
 7113        prioritizations_config = self.get_config_json(
 7114            name="prioritizations", config_file=prioritization_config_file
 7115        )
 7116
 7117        # Prioritization prefix
 7118        pz_prefix_default = "PZ"
 7119        if pz_prefix is None:
 7120            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
 7121
 7122        # Prioritization options
 7123        profiles = prioritization_param.get("profiles", [])
 7124        if isinstance(profiles, str):
 7125            profiles = profiles.split(",")
 7126        pzfields = prioritization_param.get(
 7127            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
 7128        )
 7129        if isinstance(pzfields, str):
 7130            pzfields = pzfields.split(",")
 7131        default_profile = prioritization_param.get("default_profile", None)
 7132        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
 7133        prioritization_score_mode = prioritization_param.get(
 7134            "prioritization_score_mode", "HOWARD"
 7135        )
 7136
 7137        # Quick Prioritizations
 7138        prioritizations = param.get("prioritizations", None)
 7139        if prioritizations:
 7140            log.info("Quick Prioritization:")
 7141            for profile in prioritizations.split(","):
 7142                if profile not in profiles:
 7143                    profiles.append(profile)
 7144                    log.info(f"   {profile}")
 7145
 7146        # If profile "ALL" provided, all profiles in the config profiles
 7147        if "ALL" in profiles:
 7148            profiles = list(prioritizations_config.keys())
 7149
 7150        for profile in profiles:
 7151            if prioritizations_config.get(profile, None):
 7152                log.debug(f"Profile '{profile}' configured")
 7153            else:
 7154                msg_error = f"Profile '{profile}' NOT configured"
 7155                log.error(msg_error)
 7156                raise ValueError(msg_error)
 7157
 7158        if profiles:
 7159            log.info(f"Prioritization... ")
 7160        else:
 7161            log.debug(f"No profile defined")
 7162            return False
 7163
 7164        if not default_profile and len(profiles):
 7165            default_profile = profiles[0]
 7166
 7167        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
 7168        log.debug("Profiles to check: " + str(list(profiles)))
 7169
 7170        # Variables
 7171        if table is not None:
 7172            table_variants = table
 7173        else:
 7174            table_variants = self.get_table_variants(clause="update")
 7175        log.debug(f"Table to prioritize: {table_variants}")
 7176
 7177        # Added columns
 7178        added_columns = []
 7179
 7180        # Create list of PZfields
 7181        # List of PZFields
 7182        list_of_pzfields_original = pzfields + [
 7183            pzfield + pzfields_sep + profile
 7184            for pzfield in pzfields
 7185            for profile in profiles
 7186        ]
 7187        list_of_pzfields = []
 7188        log.debug(f"{list_of_pzfields_original}")
 7189
 7190        # Remove existing PZfields to use if exists
 7191        for pzfield in list_of_pzfields_original:
 7192            if self.get_header().infos.get(pzfield, None) is None:
 7193                list_of_pzfields.append(pzfield)
 7194                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
 7195            else:
 7196                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
 7197
 7198        if list_of_pzfields:
 7199
 7200            # Explode Infos prefix
 7201            explode_infos_prefix = self.get_explode_infos_prefix()
 7202
 7203            # PZfields tags description
 7204            PZfields_INFOS = {
 7205                f"{pz_prefix}Tags": {
 7206                    "ID": f"{pz_prefix}Tags",
 7207                    "Number": ".",
 7208                    "Type": "String",
 7209                    "Description": "Variant tags based on annotation criteria",
 7210                },
 7211                f"{pz_prefix}Score": {
 7212                    "ID": f"{pz_prefix}Score",
 7213                    "Number": 1,
 7214                    "Type": "Integer",
 7215                    "Description": "Variant score based on annotation criteria",
 7216                },
 7217                f"{pz_prefix}Flag": {
 7218                    "ID": f"{pz_prefix}Flag",
 7219                    "Number": 1,
 7220                    "Type": "String",
 7221                    "Description": "Variant flag based on annotation criteria",
 7222                },
 7223                f"{pz_prefix}Comment": {
 7224                    "ID": f"{pz_prefix}Comment",
 7225                    "Number": ".",
 7226                    "Type": "String",
 7227                    "Description": "Variant comment based on annotation criteria",
 7228                },
 7229                f"{pz_prefix}Infos": {
 7230                    "ID": f"{pz_prefix}Infos",
 7231                    "Number": ".",
 7232                    "Type": "String",
 7233                    "Description": "Variant infos based on annotation criteria",
 7234                },
 7235                f"{pz_prefix}Class": {
 7236                    "ID": f"{pz_prefix}Class",
 7237                    "Number": ".",
 7238                    "Type": "String",
 7239                    "Description": "Variant class based on annotation criteria",
 7240                },
 7241            }
 7242
 7243            # Create INFO fields if not exist
 7244            for field in PZfields_INFOS:
 7245                field_ID = PZfields_INFOS[field]["ID"]
 7246                field_description = PZfields_INFOS[field]["Description"]
 7247                if field_ID not in self.get_header().infos and field_ID in pzfields:
 7248                    field_description = (
 7249                        PZfields_INFOS[field]["Description"]
 7250                        + f", profile {default_profile}"
 7251                    )
 7252                    self.get_header().infos[field_ID] = vcf.parser._Info(
 7253                        field_ID,
 7254                        PZfields_INFOS[field]["Number"],
 7255                        PZfields_INFOS[field]["Type"],
 7256                        field_description,
 7257                        "unknown",
 7258                        "unknown",
 7259                        code_type_map[PZfields_INFOS[field]["Type"]],
 7260                    )
 7261
 7262            # Create INFO fields if not exist for each profile
 7263            for profile in prioritizations_config:
 7264                if profile in profiles or profiles == []:
 7265                    for field in PZfields_INFOS:
 7266                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
 7267                        field_description = (
 7268                            PZfields_INFOS[field]["Description"]
 7269                            + f", profile {profile}"
 7270                        )
 7271                        if (
 7272                            field_ID not in self.get_header().infos
 7273                            and field in pzfields
 7274                        ):
 7275                            self.get_header().infos[field_ID] = vcf.parser._Info(
 7276                                field_ID,
 7277                                PZfields_INFOS[field]["Number"],
 7278                                PZfields_INFOS[field]["Type"],
 7279                                field_description,
 7280                                "unknown",
 7281                                "unknown",
 7282                                code_type_map[PZfields_INFOS[field]["Type"]],
 7283                            )
 7284
 7285            # Header
 7286            for pzfield in list_of_pzfields:
 7287                if re.match(f"{pz_prefix}Score.*", pzfield):
 7288                    added_column = self.add_column(
 7289                        table_name=table_variants,
 7290                        column_name=pzfield,
 7291                        column_type="INTEGER",
 7292                        default_value="0",
 7293                    )
 7294                elif re.match(f"{pz_prefix}Flag.*", pzfield):
 7295                    added_column = self.add_column(
 7296                        table_name=table_variants,
 7297                        column_name=pzfield,
 7298                        column_type="BOOLEAN",
 7299                        default_value="1",
 7300                    )
 7301                elif re.match(f"{pz_prefix}Class.*", pzfield):
 7302                    added_column = self.add_column(
 7303                        table_name=table_variants,
 7304                        column_name=pzfield,
 7305                        column_type="VARCHAR[]",
 7306                        default_value="null",
 7307                    )
 7308                else:
 7309                    added_column = self.add_column(
 7310                        table_name=table_variants,
 7311                        column_name=pzfield,
 7312                        column_type="STRING",
 7313                        default_value="''",
 7314                    )
 7315                added_columns.append(added_column)
 7316
 7317            # Profiles
 7318            if profiles:
 7319
 7320                # foreach profile in configuration file
 7321                for profile in prioritizations_config:
 7322
 7323                    # If profile is asked in param, or ALL are asked (empty profile [])
 7324                    if profile in profiles or profiles == []:
 7325                        log.info(f"Profile '{profile}'")
 7326
 7327                        sql_set_info_option = ""
 7328
 7329                        sql_set_info = []
 7330
 7331                        # PZ fields set
 7332
 7333                        # PZScore
 7334                        if (
 7335                            f"{pz_prefix}Score{pzfields_sep}{profile}"
 7336                            in list_of_pzfields
 7337                        ):
 7338                            sql_set_info.append(
 7339                                f"""
 7340                                    concat(
 7341                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
 7342                                        {pz_prefix}Score{pzfields_sep}{profile}
 7343                                    ) 
 7344                                """
 7345                            )
 7346                            if (
 7347                                profile == default_profile
 7348                                and f"{pz_prefix}Score" in list_of_pzfields
 7349                            ):
 7350                                sql_set_info.append(
 7351                                    f"""
 7352                                        concat(
 7353                                            '{pz_prefix}Score=',
 7354                                            {pz_prefix}Score{pzfields_sep}{profile}
 7355                                        )
 7356                                    """
 7357                                )
 7358
 7359                        # PZFlag
 7360                        if (
 7361                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7362                            in list_of_pzfields
 7363                        ):
 7364                            sql_set_info.append(
 7365                                f"""
 7366                                    concat(
 7367                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
 7368                                        CASE 
 7369                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7370                                            THEN 'PASS'
 7371                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7372                                            THEN 'FILTERED'
 7373                                        END
 7374                                    ) 
 7375                                """
 7376                            )
 7377                            if (
 7378                                profile == default_profile
 7379                                and f"{pz_prefix}Flag" in list_of_pzfields
 7380                            ):
 7381                                sql_set_info.append(
 7382                                    f"""
 7383                                        concat(
 7384                                            '{pz_prefix}Flag=',
 7385                                            CASE 
 7386                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7387                                                THEN 'PASS'
 7388                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7389                                                THEN 'FILTERED'
 7390                                            END
 7391                                        )
 7392                                    """
 7393                                )
 7394
 7395                        # PZClass
 7396                        if (
 7397                            f"{pz_prefix}Class{pzfields_sep}{profile}"
 7398                            in list_of_pzfields
 7399                        ):
 7400                            sql_set_info.append(
 7401                                f"""
 7402                                    concat(
 7403                                        '{pz_prefix}Class{pzfields_sep}{profile}=',
 7404                                        CASE
 7405                                            WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7406                                            THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7407                                            ELSE '.'
 7408                                        END 
 7409                                    )
 7410                                    
 7411                                """
 7412                            )
 7413                            if (
 7414                                profile == default_profile
 7415                                and f"{pz_prefix}Class" in list_of_pzfields
 7416                            ):
 7417                                sql_set_info.append(
 7418                                    f"""
 7419                                        concat(
 7420                                            '{pz_prefix}Class=',
 7421                                            CASE
 7422                                                WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7423                                                THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7424                                                ELSE '.'
 7425                                            END 
 7426                                        )
 7427                                    """
 7428                                )
 7429
 7430                        # PZComment
 7431                        if (
 7432                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7433                            in list_of_pzfields
 7434                        ):
 7435                            sql_set_info.append(
 7436                                f"""
 7437                                    CASE
 7438                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7439                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
 7440                                        ELSE ''
 7441                                    END
 7442                                """
 7443                            )
 7444                            if (
 7445                                profile == default_profile
 7446                                and f"{pz_prefix}Comment" in list_of_pzfields
 7447                            ):
 7448                                sql_set_info.append(
 7449                                    f"""
 7450                                        CASE
 7451                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7452                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
 7453                                            ELSE ''
 7454                                        END
 7455                                    """
 7456                                )
 7457
 7458                        # PZInfos
 7459                        if (
 7460                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7461                            in list_of_pzfields
 7462                        ):
 7463                            sql_set_info.append(
 7464                                f"""
 7465                                    CASE
 7466                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7467                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
 7468                                        ELSE ''
 7469                                    END
 7470                                """
 7471                            )
 7472                            if (
 7473                                profile == default_profile
 7474                                and f"{pz_prefix}Infos" in list_of_pzfields
 7475                            ):
 7476                                sql_set_info.append(
 7477                                    f"""
 7478                                        CASE
 7479                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7480                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
 7481                                            ELSE ''
 7482                                        END
 7483                                    """
 7484                                )
 7485
 7486                        # Merge PZfields
 7487                        sql_set_info_option = ""
 7488                        sql_set_sep = ""
 7489                        for sql_set in sql_set_info:
 7490                            if sql_set_sep:
 7491                                sql_set_info_option += f"""
 7492                                    , concat('{sql_set_sep}', {sql_set})
 7493                                """
 7494                            else:
 7495                                sql_set_info_option += f"""
 7496                                    , {sql_set}
 7497                                """
 7498                            sql_set_sep = ";"
 7499
 7500                        sql_queries = []
 7501                        criterion_fields_profile = []
 7502                        annotation_view_name = (
 7503                            "annotation_view_for_prioritization_"
 7504                            + str(random.randrange(1000))
 7505                        )
 7506                        annotation_view_prefix = ""
 7507                        for annotation in prioritizations_config[profile]:
 7508
 7509                            # skip special sections
 7510                            if annotation.startswith("_"):
 7511                                continue
 7512
 7513                            # For each criterions
 7514                            for criterion in prioritizations_config[profile][
 7515                                annotation
 7516                            ]:
 7517
 7518                                # Criterion mode
 7519                                criterion_mode = None
 7520                                if np.any(
 7521                                    np.isin(list(criterion.keys()), ["type", "value"])
 7522                                ):
 7523                                    criterion_mode = "operation"
 7524                                elif np.any(
 7525                                    np.isin(list(criterion.keys()), ["sql", "fields"])
 7526                                ):
 7527                                    criterion_mode = "sql"
 7528                                log.debug(f"Criterion Mode: {criterion_mode}")
 7529
 7530                                # Criterion parameters
 7531                                criterion_type = criterion.get("type", None)
 7532                                criterion_value = criterion.get("value", None)
 7533                                criterion_sql = criterion.get("sql", None)
 7534                                criterion_fields = criterion.get("fields", None)
 7535                                criterion_score = criterion.get("score", 0)
 7536                                criterion_flag = criterion.get("flag", "PASS")
 7537                                criterion_class = criterion.get("class", None)
 7538                                criterion_flag_bool = criterion_flag == "PASS"
 7539                                criterion_comment = (
 7540                                    ", ".join(criterion.get("comment", []))
 7541                                    .replace("'", "''")
 7542                                    .replace(";", ",")
 7543                                    .replace("\t", " ")
 7544                                )
 7545                                criterion_infos = (
 7546                                    str(criterion)
 7547                                    .replace("'", "''")
 7548                                    .replace(";", ",")
 7549                                    .replace("\t", " ")
 7550                                )
 7551
 7552                                # SQL
 7553                                if criterion_sql is not None and isinstance(
 7554                                    criterion_sql, list
 7555                                ):
 7556                                    criterion_sql = " ".join(criterion_sql)
 7557
 7558                                # Fields and explode
 7559                                if criterion_fields is None:
 7560                                    criterion_fields = [annotation]
 7561                                if not isinstance(criterion_fields, list):
 7562                                    criterion_fields = str(criterion_fields).split(",")
 7563
 7564                                # Class
 7565                                if criterion_class is not None and not isinstance(
 7566                                    criterion_class, list
 7567                                ):
 7568                                    criterion_class = str(criterion_class).split(",")
 7569
 7570                                # Add criterion fields to the list of profile's criteria
 7571                                criterion_fields_profile = list(
 7572                                    set(criterion_fields_profile + criterion_fields)
 7573                                )
 7574
 7575                                sql_set = []
 7576                                sql_set_info = []
 7577
 7578                                # PZ fields set
 7579
 7580                                # PZScore
 7581                                if (
 7582                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
 7583                                    in list_of_pzfields
 7584                                ):
 7585                                    # VaRank prioritization score mode
 7586                                    if prioritization_score_mode.upper().strip() in [
 7587                                        "VARANK",
 7588                                        "MAX",
 7589                                        "MAXIMUM",
 7590                                        "TOP",
 7591                                    ]:
 7592                                        sql_set.append(
 7593                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} ELSE {pz_prefix}Score{pzfields_sep}{profile} END "
 7594                                        )
 7595                                    # default HOWARD prioritization score mode
 7596                                    else:
 7597                                        sql_set.append(
 7598                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7599                                        )
 7600
 7601                                # PZFlag
 7602                                if (
 7603                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7604                                    in list_of_pzfields
 7605                                ):
 7606                                    sql_set.append(
 7607                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
 7608                                    )
 7609
 7610                                # PZClass
 7611                                if (
 7612                                    f"{pz_prefix}Class{pzfields_sep}{profile}"
 7613                                    in list_of_pzfields
 7614                                    and criterion_class is not None
 7615                                ):
 7616                                    sql_set.append(
 7617                                        f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
 7618                                    )
 7619
 7620                                # PZComment
 7621                                if (
 7622                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7623                                    in list_of_pzfields
 7624                                ):
 7625                                    sql_set.append(
 7626                                        f"""
 7627                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
 7628                                                concat(
 7629                                                    {pz_prefix}Comment{pzfields_sep}{profile},
 7630                                                    CASE 
 7631                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
 7632                                                        THEN ', '
 7633                                                        ELSE ''
 7634                                                    END,
 7635                                                    '{criterion_comment}'
 7636                                                )
 7637                                        """
 7638                                    )
 7639
 7640                                # PZInfos
 7641                                if (
 7642                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7643                                    in list_of_pzfields
 7644                                ):
 7645                                    sql_set.append(
 7646                                        f"""
 7647                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
 7648                                                concat(
 7649                                                    {pz_prefix}Infos{pzfields_sep}{profile},
 7650                                                    '{criterion_infos}'
 7651                                                )
 7652                                        """
 7653                                    )
 7654                                sql_set_option = ",".join(sql_set)
 7655
 7656                                # Criterion and comparison
 7657                                if sql_set_option:
 7658
 7659                                    # Operation mode
 7660                                    if criterion_mode in ["operation"]:
 7661
 7662                                        # Check if value is a float
 7663                                        try:
 7664                                            float(criterion_value)
 7665                                            sql_update = f"""
 7666                                                UPDATE "{table_variants}"
 7667                                                SET {sql_set_option}
 7668                                                FROM (
 7669                                                    SELECT *
 7670                                                    FROM "{annotation_view_name}"
 7671                                                    WHERE (
 7672                                                        CAST("{annotation_view_name}"."{annotation_view_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
 7673                                                        AND   CAST("{annotation_view_name}"."{annotation_view_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
 7674                                                        )
 7675                                                    ) AS "{annotation_view_name}"
 7676                                                WHERE "{table_variants}"."#CHROM" == "{annotation_view_name}"."#CHROM"
 7677                                                  AND "{table_variants}"."POS" == "{annotation_view_name}"."POS"
 7678                                                  AND "{table_variants}"."REF" == "{annotation_view_name}"."REF"
 7679                                                  AND "{table_variants}"."ALT" == "{annotation_view_name}"."ALT" 
 7680                                                
 7681                                            """
                                        # If not a float
 7683                                        except:
 7684                                            contains_option = ""
 7685                                            if criterion_type == "contains":
 7686                                                contains_option = ".*"
 7687                                            sql_update = f"""
 7688                                                UPDATE "{table_variants}"
 7689                                                SET {sql_set_option}
 7690                                                FROM (
 7691                                                    SELECT *
 7692                                                    FROM "{annotation_view_name}"
 7693                                                    WHERE (
 7694                                                        "{annotation_view_name}"."{annotation_view_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
 7695                                                        )
 7696                                                    ) AS "{annotation_view_name}"
 7697                                                WHERE "{table_variants}"."#CHROM" == "{annotation_view_name}"."#CHROM"
 7698                                                  AND "{table_variants}"."POS" == "{annotation_view_name}"."POS"
 7699                                                  AND "{table_variants}"."REF" == "{annotation_view_name}"."REF"
 7700                                                  AND "{table_variants}"."ALT" == "{annotation_view_name}"."ALT" 
 7701                                                  
 7702                                            """
 7703                                        sql_queries.append(sql_update)
 7704
 7705                                    # SQL mode
 7706                                    elif criterion_mode in ["sql"]:
 7707
 7708                                        sql_update = f"""
 7709                                            UPDATE {table_variants}
 7710                                            SET {sql_set_option}
 7711                                            FROM (
 7712                                                SELECT *
 7713                                                FROM "{annotation_view_name}"
 7714                                                WHERE ({criterion_sql})
 7715                                                ) AS "{annotation_view_name}"
 7716                                            WHERE "{table_variants}"."#CHROM" == "{annotation_view_name}"."#CHROM"
 7717                                                AND "{table_variants}"."POS" == "{annotation_view_name}"."POS"
 7718                                                AND "{table_variants}"."REF" == "{annotation_view_name}"."REF"
 7719                                                AND "{table_variants}"."ALT" == "{annotation_view_name}"."ALT" 
 7720                                        """
 7721                                        sql_queries.append(sql_update)
 7722
 7723                                    else:
 7724                                        msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
 7725                                        log.error(msg_err)
 7726                                        raise ValueError(msg_err)
 7727
 7728                                else:
 7729                                    log.warning(
 7730                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
 7731                                    )
 7732
 7733                        # PZTags
 7734                        if (
 7735                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
 7736                            in list_of_pzfields
 7737                        ):
 7738
                            # Create PZTags value
 7740                            pztags_value = ""
 7741                            pztags_sep_default = ","
 7742                            pztags_sep = ""
 7743                            for pzfield in pzfields:
 7744                                if pzfield not in [f"{pz_prefix}Tags"]:
 7745                                    if (
 7746                                        f"{pzfield}{pzfields_sep}{profile}"
 7747                                        in list_of_pzfields
 7748                                    ):
 7749                                        if pzfield in [f"{pz_prefix}Flag"]:
 7750                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7751                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
 7752                                                    THEN 'PASS'
 7753                                                    ELSE 'FILTERED'
 7754                                                END, '"""
 7755                                        elif pzfield in [f"{pz_prefix}Class"]:
 7756                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7757                                                CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7758                                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7759                                                    ELSE '.'
 7760                                                END, '"""
 7761                                        else:
 7762                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
 7763                                        pztags_sep = pztags_sep_default
 7764
                            # Add query update for PZTags
 7766                            sql_update_pztags = f"""
 7767                                UPDATE {table_variants}
 7768                                SET INFO = concat(
 7769                                        INFO,
 7770                                        CASE WHEN INFO NOT in ('','.')
 7771                                                THEN ';'
 7772                                                ELSE ''
 7773                                        END,
 7774                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
 7775                                    )
 7776                                WHERE 1=1
 7777                                """
 7778                            sql_queries.append(sql_update_pztags)
 7779
                            # Add query update for PZTags for the default profile
 7781                            if profile == default_profile:
 7782                                sql_update_pztags_default = f"""
 7783                                UPDATE {table_variants}
 7784                                SET INFO = concat(
 7785                                        INFO,
 7786                                        ';',
 7787                                        '{pz_prefix}Tags={pztags_value}'
 7788                                    )
 7789                                    WHERE 1=1
 7790                                """
 7791                                sql_queries.append(sql_update_pztags_default)
 7792
 7793                        log.info(f"""Profile '{profile}' - Prioritization... """)
 7794
 7795                        # Create annotations view for prioritization
 7796                        log.debug(
 7797                            f"""Profile '{profile}' - Prioritization - Create '{annotation_view_name}' view with '{criterion_fields_profile}'... """
 7798                        )
 7799                        annotation_view = self.create_annotations_view(
 7800                            view=annotation_view_name,
 7801                            prefix=annotation_view_prefix,
 7802                            fields=criterion_fields_profile,
 7803                            drop_view=True,
 7804                        )
 7805
 7806                        # Chromosomes list
 7807                        sql_uniq_chrom = f"""
 7808                            SELECT DISTINCT "#CHROM"
 7809                            FROM {table_variants}
 7810                        """
 7811                        chroms = self.get_query_to_df(sql_uniq_chrom)["#CHROM"].tolist()
 7812
 7813                        for chrom in chroms:
 7814
 7815                            log.debug(
 7816                                f"""Profile '{profile}' - Prioritization query - Chromosome '{chrom}'... """
 7817                            )
 7818
 7819                            if sql_queries:
 7820
 7821                                # Query num
 7822                                num_query = 0
 7823
 7824                                # For each query
 7825                                for sql_query in sql_queries:
 7826
 7827                                    # Query num
 7828                                    num_query += 1
 7829
 7830                                    sql_query_chrom = f"""
 7831                                        {sql_query}
 7832                                        AND {table_variants}."#CHROM" LIKE '{chrom}' 
 7833                                    """
 7834                                    log.debug(
 7835                                        f"""Profile '{profile}' - Prioritization query - Chromosome '{chrom}' [{num_query}/{len(sql_queries)}]"""
 7836                                    )
 7837                                    # log.debug(f"""sql_query_chrom: {sql_query_chrom}""")
 7838                                    self.execute_query(query=sql_query_chrom)
 7839
 7840                        # Update INFO field
 7841                        log.info(f"""Profile '{profile}' - Update... """)
 7842                        sql_query_update = f"""
 7843                            UPDATE {table_variants}
 7844                            SET INFO =  
 7845                                concat(
 7846                                    CASE
 7847                                        WHEN INFO NOT IN ('','.')
 7848                                        THEN concat(INFO, ';')
 7849                                        ELSE ''
 7850                                    END
 7851                                    {sql_set_info_option}
 7852                                )
 7853                        """
 7854                        # log.debug(f"sql_query_update={sql_query_update}")
 7855                        self.execute_query(query=sql_query_update)
 7856
 7857                        # Remove annotations view for prioritization
 7858                        query_drop_tmp_table = f"""
 7859                            DROP VIEW IF EXISTS {annotation_view_name}
 7860                        """
 7861                        self.execute_query(query=query_drop_tmp_table)
 7862
 7863        else:
 7864
 7865            log.warning(f"No profiles in parameters")
 7866
 7867        # Remove added columns
 7868        for added_column in added_columns:
 7869            self.drop_column(column=added_column)
 7870
 7871        # Explode INFOS fields into table fields
 7872        if self.get_explode_infos():
 7873            self.explode_infos(
 7874                prefix=self.get_explode_infos_prefix(),
 7875                fields=self.get_explode_infos_fields(),
 7876                force=True,
 7877            )
 7878
 7879        return True
 7880
 7881    ###
 7882    # HGVS
 7883    ###
 7884
 7885    def annotation_hgvs(self, threads: int = None) -> None:
 7886        """
 7887        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
 7888        coordinates and alleles.
 7889
 7890        :param threads: The `threads` parameter is an optional integer that specifies the number of
 7891        threads to use for parallel processing. If no value is provided, it will default to the number
 7892        of threads obtained from the `get_threads()` method
 7893        :type threads: int
 7894        """
 7895
 7896        # Function for each partition of the Dask Dataframe
 7897        def partition_function(partition):
 7898            """
 7899            The function `partition_function` applies the `annotation_hgvs_partition` function to
 7900            each row of a DataFrame called `partition`.
 7901
 7902            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
 7903            to be processed
 7904            :return: the result of applying the "annotation_hgvs_partition" function to each row of
 7905            the "partition" dataframe along the axis 1.
 7906            """
 7907            return partition.apply(annotation_hgvs_partition, axis=1)
 7908
        def annotation_hgvs_partition(row) -> str:
            """
            Build the HGVS annotation string for a single variant row.

            Finds all transcripts registered in ``refseq_df`` at the exact
            chromosome/position of the variant (queried through the
            enclosing ``polars_conn`` SQL context), formats one HGVS name
            per transcript via ``format_hgvs_name`` (optionally including
            exon, protein and gene information depending on the enclosing
            ``use_exon``/``use_protein``/``add_protein``/``full_format``
            flags), and joins the names with ','.

            :param row: dict-like object with keys "CHROM", "POS", "REF"
            and "ALT" describing one variant
            :return: comma-separated string of HGVS names (empty string
            when no transcript matches the position)
            """

            # Variant genomic coordinates and alleles
            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Transcripts at this exact chromosome/position
            # NOTE(review): equality on POS assumes refseq_df holds one row
            # per covered position — confirm against how refseq_df is built
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                SELECT transcript
                FROM refseq_df
                WHERE CHROM='{chr}'
                AND POS={pos}
            """
                )["transcript"]
            )

            # Collected HGVS names (one or two entries per transcript)
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Resolve the transcript object by name
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon number at the variant position (only if requested)
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein accession linked to this transcript, looked up in
                # refseqlink_df only when a protein-level name is needed
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                        SELECT protein
                        FROM refseqlink_df
                        WHERE transcript='{transcript_name}'
                        LIMIT 1
                    """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # Primary HGVS name, formatted per the enclosing options
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # When add_protein is set (and the primary name was not
                # already protein-level or full-format), emit an extra
                # protein-level HGVS name for the same transcript
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Join all HGVS names into a single comma-separated string
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full
 8004
 8005        # Polars connexion
 8006        polars_conn = pl.SQLContext(register_globals=True, eager=True)
 8007
 8008        # Config
 8009        config = self.get_config()
 8010
 8011        # Databases
 8012        # Genome
 8013        databases_genomes_folders = (
 8014            config.get("folders", {})
 8015            .get("databases", {})
 8016            .get("genomes", DEFAULT_GENOME_FOLDER)
 8017        )
 8018        databases_genome = (
 8019            config.get("folders", {}).get("databases", {}).get("genomes", "")
 8020        )
 8021        # refseq database folder
 8022        databases_refseq_folders = (
 8023            config.get("folders", {})
 8024            .get("databases", {})
 8025            .get("refseq", DEFAULT_REFSEQ_FOLDER)
 8026        )
 8027        # refseq
 8028        databases_refseq = config.get("databases", {}).get("refSeq", None)
 8029        # refSeqLink
 8030        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)
 8031
 8032        # Param
 8033        param = self.get_param()
 8034
 8035        # Quick HGVS
 8036        if "hgvs_options" in param and param.get("hgvs_options", ""):
 8037            log.info(f"Quick HGVS Annotation:")
 8038            if not param.get("hgvs", None):
 8039                param["hgvs"] = {}
 8040            for option in param.get("hgvs_options", "").split(","):
 8041                option_var_val = option.split("=")
 8042                option_var = option_var_val[0]
 8043                if len(option_var_val) > 1:
 8044                    option_val = option_var_val[1]
 8045                else:
 8046                    option_val = "True"
 8047                if option_val.upper() in ["TRUE"]:
 8048                    option_val = True
 8049                elif option_val.upper() in ["FALSE"]:
 8050                    option_val = False
 8051                log.info(f"   {option_var}={option_val}")
 8052                param["hgvs"][option_var] = option_val
 8053
 8054        # Check if HGVS annotation enabled
 8055        if "hgvs" in param:
 8056            log.info(f"HGVS Annotation... ")
 8057            for hgvs_option in param.get("hgvs", {}):
 8058                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
 8059        else:
 8060            return
 8061
 8062        # HGVS Param
 8063        param_hgvs = param.get("hgvs", {})
 8064        use_exon = param_hgvs.get("use_exon", False)
 8065        use_gene = param_hgvs.get("use_gene", False)
 8066        use_protein = param_hgvs.get("use_protein", False)
 8067        add_protein = param_hgvs.get("add_protein", False)
 8068        full_format = param_hgvs.get("full_format", False)
 8069        use_version = param_hgvs.get("use_version", False)
 8070        codon_type = param_hgvs.get("codon_type", "3")
 8071
 8072        # refSseq refSeqLink
 8073        databases_refseq = param_hgvs.get("refseq", databases_refseq)
 8074        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)
 8075
 8076        # Assembly
 8077        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 8078
 8079        # Genome
 8080        genome_file = None
 8081        if find_genome(databases_genome):
 8082            genome_file = find_genome(databases_genome)
 8083        else:
 8084            genome_file = find_genome(
 8085                genome_path=databases_genomes_folders, assembly=assembly
 8086            )
 8087        log.debug("Genome: " + str(genome_file))
 8088
 8089        # refSseq
 8090        refseq_file = find_file_prefix(
 8091            input_file=databases_refseq,
 8092            prefix="ncbiRefSeq",
 8093            folder=databases_refseq_folders,
 8094            assembly=assembly,
 8095        )
 8096        log.debug("refSeq: " + str(refseq_file))
 8097
 8098        # refSeqLink
 8099        refseqlink_file = find_file_prefix(
 8100            input_file=databases_refseqlink,
 8101            prefix="ncbiRefSeqLink",
 8102            folder=databases_refseq_folders,
 8103            assembly=assembly,
 8104        )
 8105        log.debug("refSeqLink: " + str(refseqlink_file))
 8106
 8107        # Threads
 8108        if not threads:
 8109            threads = self.get_threads()
 8110        log.debug("Threads: " + str(threads))
 8111
 8112        # Variables
 8113        table_variants = self.get_table_variants(clause="update")
 8114
 8115        # Get variants SNV and InDel only
 8116        query_variants = f"""
 8117            SELECT "#CHROM" AS CHROM, POS, REF, ALT
 8118            FROM {table_variants}
 8119            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
 8120            """
 8121        df_variants = self.get_query_to_df(query_variants)
 8122
 8123        # Added columns
 8124        added_columns = []
 8125
 8126        # Add hgvs column in variants table
 8127        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
 8128        added_column = self.add_column(
 8129            table_variants, hgvs_column_name, "STRING", default_value=None
 8130        )
 8131        added_columns.append(added_column)
 8132
 8133        log.debug(f"refSeq loading...")
 8134        # refSeq in duckDB
 8135        refseq_table = get_refseq_table(
 8136            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
 8137        )
 8138        # Loading all refSeq in Dataframe
 8139        refseq_query = f"""
 8140            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
 8141            FROM {refseq_table}
 8142            JOIN df_variants ON (
 8143                {refseq_table}.chrom = df_variants.CHROM
 8144                AND {refseq_table}.txStart<=df_variants.POS
 8145                AND {refseq_table}.txEnd>=df_variants.POS
 8146            )
 8147        """
 8148        refseq_df = self.conn.query(refseq_query).pl()
 8149
 8150        if refseqlink_file:
 8151            log.debug(f"refSeqLink loading...")
 8152            # refSeqLink in duckDB
 8153            refseqlink_table = get_refseq_table(
 8154                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
 8155            )
 8156            # Loading all refSeqLink in Dataframe
 8157            protacc_column = "protAcc_with_ver"
 8158            mrnaacc_column = "mrnaAcc_with_ver"
 8159            refseqlink_query = f"""
 8160                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
 8161                FROM {refseqlink_table} 
 8162                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
 8163                WHERE protAcc_without_ver IS NOT NULL
 8164            """
 8165            # Polars Dataframe
 8166            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()
 8167
 8168        # Read RefSeq transcripts into a python dict/model.
 8169        log.debug(f"Transcripts loading...")
 8170        with tempfile.TemporaryDirectory() as tmpdir:
 8171            transcripts_query = f"""
 8172                COPY (
 8173                    SELECT {refseq_table}.*
 8174                    FROM {refseq_table}
 8175                    JOIN df_variants ON (
 8176                        {refseq_table}.chrom=df_variants.CHROM
 8177                        AND {refseq_table}.txStart<=df_variants.POS
 8178                        AND {refseq_table}.txEnd>=df_variants.POS
 8179                    )
 8180                )
 8181                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
 8182            """
 8183            self.conn.query(transcripts_query)
 8184            with open(f"{tmpdir}/transcript.tsv") as infile:
 8185                transcripts = read_transcripts(infile)
 8186
 8187        # Polars connexion
 8188        polars_conn = pl.SQLContext(register_globals=True, eager=True)
 8189
 8190        log.debug("Genome loading...")
 8191        # Read genome sequence using pyfaidx.
 8192        genome = Fasta(genome_file)
 8193
 8194        log.debug("Start annotation HGVS...")
 8195
 8196        # Create
 8197        # a Dask Dataframe from Pandas dataframe with partition as number of threads
 8198        ddf = dd.from_pandas(df_variants, npartitions=threads)
 8199
 8200        # Use dask.dataframe.apply() to apply function on each partition
 8201        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)
 8202
 8203        # Convert Dask DataFrame to Pandas Dataframe
 8204        df = ddf.compute()
 8205
 8206        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
 8207        with tempfile.TemporaryDirectory() as tmpdir:
 8208            df_parquet = os.path.join(tmpdir, "df.parquet")
 8209            df.to_parquet(df_parquet)
 8210
 8211            # Update hgvs column
 8212            update_variant_query = f"""
 8213                UPDATE {table_variants}
 8214                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
 8215                FROM read_parquet('{df_parquet}') as df
 8216                WHERE variants."#CHROM" = df.CHROM
 8217                AND variants.POS = df.POS
 8218                AND variants.REF = df.REF
 8219                AND variants.ALT = df.ALT
 8220                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
 8221                """
 8222            self.execute_query(update_variant_query)
 8223
 8224        # Update INFO column
 8225        sql_query_update = f"""
 8226            UPDATE {table_variants}
 8227            SET INFO = 
 8228                concat(
 8229                    CASE 
 8230                        WHEN INFO NOT IN ('','.')
 8231                        THEN concat(INFO, ';')
 8232                        ELSE ''
 8233                    END,
 8234                    'hgvs=',
 8235                    {hgvs_column_name}
 8236                )
 8237            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
 8238            """
 8239        self.execute_query(sql_query_update)
 8240
 8241        # Add header
 8242        HGVS_INFOS = {
 8243            "hgvs": {
 8244                "ID": "hgvs",
 8245                "Number": ".",
 8246                "Type": "String",
 8247                "Description": f"HGVS annotatation with HOWARD",
 8248            }
 8249        }
 8250
 8251        for field in HGVS_INFOS:
 8252            field_ID = HGVS_INFOS[field]["ID"]
 8253            field_description = HGVS_INFOS[field]["Description"]
 8254            self.get_header().infos[field_ID] = vcf.parser._Info(
 8255                field_ID,
 8256                HGVS_INFOS[field]["Number"],
 8257                HGVS_INFOS[field]["Type"],
 8258                field_description,
 8259                "unknown",
 8260                "unknown",
 8261                code_type_map[HGVS_INFOS[field]["Type"]],
 8262            )
 8263
 8264        # Remove added columns
 8265        for added_column in added_columns:
 8266            self.drop_column(column=added_column)
 8267
 8268    ###
 8269    # Calculation
 8270    ###
 8271
 8272    def get_operations_help(
 8273        self, operations_config_dict: dict = {}, operations_config_file: str = None
 8274    ) -> list:
 8275
 8276        # Init
 8277        operations_help = []
 8278
 8279        # operations
 8280        operations = self.get_config_json(
 8281            name="calculations",
 8282            config_dict=operations_config_dict,
 8283            config_file=operations_config_file,
 8284        )
 8285        for op in operations:
 8286            op_name = operations[op].get("name", op).upper()
 8287            op_description = operations[op].get("description", op_name)
 8288            op_available = operations[op].get("available", False)
 8289            if op_available:
 8290                operations_help.append(f"   {op_name}: {op_description}")
 8291
 8292        # Sort operations
 8293        operations_help.sort()
 8294
 8295        # insert header
 8296        operations_help.insert(0, "Available calculation operations:")
 8297
 8298        # Return
 8299        return operations_help
 8300
    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        Run the configured calculation operations on the variants table.

        Resolves the list of operations to run (from the `operations` argument,
        the param section "calculation"/"calculations", and/or the quick
        "calculations" comma-separated param), looks each one up in the
        operations configuration, and dispatches it to either
        `calculation_process_function` (type "python") or
        `calculation_process_sql` (type "sql").

        :param operations: operations to run, keyed by operation name; may be
            overridden by param "calculation"/"calculations" when present
        :param operations_config_dict: operations configuration as a dict
        :param operations_config_file: path to an operations configuration file
        :raises ValueError: if an operation name is not in the operations
            configuration, or if its type is neither "python" nor "sql"

        param json example:
            "calculation": {
                "NOMEN": {
                    "options": {
                        "hgvs_field": "hgvs"
                    },
                "middle" : null
            }
        """

        # Param
        param = self.get_param()

        # Operations config file: fall back to param "calculation_config" if not given
        if operations_config_file is None:
            operations_config_file = param.get("calculation", {}).get(
                "calculation_config", None
            )

        # Load the operations configuration (dict and/or file)
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Normalize configuration keys to upper case (operation names are case-insensitive)
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations requested in param take precedence over the argument
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation: comma-separated list in param "calculations"
        if param.get("calculations", None):

            # List of operation names, whitespace-trimmed
            calculations_list = [
                value.strip() for value in param.get("calculations", "").split(",")
            ]

            # Log
            log.info(f"Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f"   {calculation_key}")

            # Build a temporary dict so quick operations keep their listed order,
            # reusing any options already defined for them in `operations`
            operations_tmp = {}
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations_tmp:
                    log.debug(
                        f"{calculation_operation}.upper() not in {operations_tmp}"
                    )
                    operations_tmp[calculation_operation.upper()] = {}
                    add_value_into_dict(
                        dict_tree=operations_tmp,
                        sections=[
                            calculation_operation.upper(),
                        ],
                        value=operations.get(calculation_operation.upper(), {}),
                    )
            # Append operations from param that were not in the quick list
            for calculation_operation in operations:
                if calculation_operation not in operations_tmp:
                    operations_tmp[calculation_operation] = operations.get(
                        calculation_operation, {}
                    )

            # Merged, order-preserving operations dict
            operations = operations_tmp

        # Last fallback: operations defined in param "calculation"/"calculations"
        if not operations:
            operations = param.get("calculation", {}).get("calculations", {})

        if operations:
            log.info(f"Calculations...")

        # Dispatch each operation by its configured type
        for operation_name in operations:
            operation_name = operation_name.upper()
            if operation_name not in [""]:
                if operation_name in operations_config:
                    log.info(f"Calculation '{operation_name}'")
                    operation = operations_config[operation_name]
                    operation_type = operation.get("type", "sql")
                    if operation_type == "python":
                        self.calculation_process_function(
                            operation=operation, operation_name=operation_name
                        )
                    elif operation_type == "sql":
                        self.calculation_process_sql(
                            operation=operation, operation_name=operation_name
                        )
                    else:
                        log.error(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                else:
                    log.error(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )
                    raise ValueError(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )

        # Explode INFO fields into table columns if requested
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
 8428
 8429    def calculation_process_sql(
 8430        self, operation: dict, operation_name: str = "unknown"
 8431    ) -> None:
 8432        """
 8433        The `calculation_process_sql` function takes in a mathematical operation as a string and
 8434        performs the operation, updating the specified table with the result.
 8435
 8436        :param operation: The `operation` parameter is a dictionary that contains information about the
 8437        mathematical operation to be performed. It includes the following keys:
 8438        :type operation: dict
 8439        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8440        the mathematical operation being performed. It is used for logging and error handling purposes,
 8441        defaults to unknown
 8442        :type operation_name: str (optional)
 8443        """
 8444
 8445        # Operation infos
 8446        operation_name = operation.get("name", "unknown")
 8447        log.debug(f"process SQL {operation_name}")
 8448        output_column_name = operation.get("output_column_name", operation_name)
 8449        output_column_type = operation.get("output_column_type", "String")
 8450        prefix = operation.get("explode_infos_prefix", "")
 8451        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
 8452        output_column_description = operation.get(
 8453            "output_column_description", f"{operation_name} operation"
 8454        )
 8455        operation_query = operation.get("operation_query", None)
 8456        if isinstance(operation_query, list):
 8457            operation_query = " ".join(operation_query)
 8458        operation_info_fields = operation.get("info_fields", [])
 8459        operation_info_fields_check = operation.get("info_fields_check", False)
 8460        operation_info = operation.get("operation_info", True)
 8461        operation_table = operation.get(
 8462            "table", self.get_table_variants(clause="alter")
 8463        )
 8464
 8465        # table variants
 8466        if operation_table:
 8467            table_variants = operation_table
 8468        else:
 8469            table_variants = self.get_table_variants(clause="alter")
 8470
 8471        if operation_query:
 8472
 8473            # Info fields check
 8474            operation_info_fields_check_result = True
 8475            if operation_info_fields_check:
 8476                header_infos = self.get_header().infos
 8477                for info_field in operation_info_fields:
 8478                    operation_info_fields_check_result = (
 8479                        operation_info_fields_check_result
 8480                        and info_field in header_infos
 8481                    )
 8482
 8483            # If info fields available
 8484            if operation_info_fields_check_result:
 8485
 8486                # Added_columns
 8487                added_columns = []
 8488
 8489                # Create VCF header field
 8490                vcf_reader = self.get_header()
 8491                vcf_reader.infos[output_column_name] = vcf.parser._Info(
 8492                    output_column_name,
 8493                    ".",
 8494                    output_column_type,
 8495                    output_column_description,
 8496                    "howard calculation",
 8497                    "0",
 8498                    self.code_type_map.get(output_column_type),
 8499                )
 8500
 8501                # Explode infos if needed
 8502                log.debug(f"calculation_process_sql prefix {prefix}")
 8503                added_columns += self.explode_infos(
 8504                    prefix=prefix,
 8505                    fields=[output_column_name] + operation_info_fields,
 8506                    force=False,
 8507                    table=table_variants,
 8508                )
 8509
 8510                # Create column
 8511                added_column = self.add_column(
 8512                    table_name=table_variants,
 8513                    column_name=prefix + output_column_name,
 8514                    column_type=output_column_type_sql,
 8515                    default_value="null",
 8516                )
 8517                added_columns.append(added_column)
 8518
 8519                # Operation calculation
 8520                try:
 8521
 8522                    # Query to update calculation column
 8523                    sql_update = f"""
 8524                        UPDATE {table_variants}
 8525                        SET "{prefix}{output_column_name}" = ({operation_query})
 8526                    """
 8527                    self.conn.execute(sql_update)
 8528
 8529                    # Add to INFO
 8530                    if operation_info:
 8531                        sql_update_info = f"""
 8532                            UPDATE {table_variants}
 8533                            SET "INFO" =
 8534                                concat(
 8535                                    CASE
 8536                                        WHEN "INFO" IS NOT NULL
 8537                                        THEN concat("INFO", ';')
 8538                                        ELSE ''
 8539                                    END,
 8540                                    '{output_column_name}=',
 8541                                    "{prefix}{output_column_name}"
 8542                                )
 8543                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
 8544                        """
 8545                        self.conn.execute(sql_update_info)
 8546
 8547                except:
 8548                    log.error(
 8549                        f"Operations config: Calculation '{operation_name}' query failed"
 8550                    )
 8551                    raise ValueError(
 8552                        f"Operations config: Calculation '{operation_name}' query failed"
 8553                    )
 8554
 8555                # Remove added columns
 8556                for added_column in added_columns:
 8557                    log.debug(f"added_column: {added_column}")
 8558                    self.drop_column(column=added_column)
 8559
 8560            else:
 8561                log.error(
 8562                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8563                )
 8564                raise ValueError(
 8565                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8566                )
 8567
 8568        else:
 8569            log.error(
 8570                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8571            )
 8572            raise ValueError(
 8573                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8574            )
 8575
 8576    def calculation_process_function(
 8577        self, operation: dict, operation_name: str = "unknown"
 8578    ) -> None:
 8579        """
 8580        The `calculation_process_function` takes in an operation dictionary and performs the specified
 8581        function with the given parameters.
 8582
 8583        :param operation: The `operation` parameter is a dictionary that contains information about the
 8584        operation to be performed. It has the following keys:
 8585        :type operation: dict
 8586        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8587        the operation being performed. It is used for logging purposes, defaults to unknown
 8588        :type operation_name: str (optional)
 8589        """
 8590
 8591        operation_name = operation["name"]
 8592        log.debug(f"process Python {operation_name}")
 8593        function_name = operation["function_name"]
 8594        function_params = operation["function_params"]
 8595        getattr(self, function_name)(*function_params)
 8596
 8597    def calculation_variant_id(self) -> None:
 8598        """
 8599        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
 8600        updates the INFO field of a variants table with the variant ID.
 8601        """
 8602
 8603        # variant_id annotation field
 8604        variant_id_tag = self.get_variant_id_column()
 8605        added_columns = [variant_id_tag]
 8606
 8607        # variant_id hgvs tags"
 8608        vcf_infos_tags = {
 8609            variant_id_tag: "howard variant ID annotation",
 8610        }
 8611
 8612        # Variants table
 8613        table_variants = self.get_table_variants()
 8614
 8615        # Header
 8616        vcf_reader = self.get_header()
 8617
 8618        # Add variant_id to header
 8619        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
 8620            variant_id_tag,
 8621            ".",
 8622            "String",
 8623            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
 8624            "howard calculation",
 8625            "0",
 8626            self.code_type_map.get("String"),
 8627        )
 8628
 8629        # Update
 8630        sql_update = f"""
 8631            UPDATE {table_variants}
 8632            SET "INFO" = 
 8633                concat(
 8634                    CASE
 8635                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8636                        THEN ''
 8637                        ELSE concat("INFO", ';')
 8638                    END,
 8639                    '{variant_id_tag}=',
 8640                    "{variant_id_tag}"
 8641                )
 8642        """
 8643        self.conn.execute(sql_update)
 8644
 8645        # Remove added columns
 8646        for added_column in added_columns:
 8647            self.drop_column(column=added_column)
 8648
 8649    def calculation_extract_snpeff_hgvs(
 8650        self,
 8651        snpeff_hgvs: str = "snpeff_hgvs",
 8652        snpeff_field: str = "ANN",
 8653    ) -> None:
 8654        """
 8655        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
 8656        annotation field in a VCF file and adds them as a new column in the variants table.
 8657
 8658        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
 8659        function is used to specify the name of the column that will store the HGVS nomenclatures
 8660        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
 8661        snpeff_hgvs
 8662        :type snpeff_hgvs: str (optional)
 8663        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
 8664        function represents the field in the VCF file that contains SnpEff annotations. This field is
 8665        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
 8666        to ANN
 8667        :type snpeff_field: str (optional)
 8668        """
 8669
 8670        # Snpeff hgvs tags
 8671        vcf_infos_tags = {
 8672            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
 8673        }
 8674
 8675        # Prefix
 8676        prefix = self.get_explode_infos_prefix()
 8677        if prefix:
 8678            prefix = "INFO/"
 8679
 8680        # snpEff fields
 8681        speff_ann_infos = prefix + snpeff_field
 8682        speff_hgvs_infos = prefix + snpeff_hgvs
 8683
 8684        # Variants table
 8685        table_variants = self.get_table_variants()
 8686
 8687        # Header
 8688        vcf_reader = self.get_header()
 8689
 8690        # Add columns
 8691        added_columns = []
 8692
 8693        # Explode HGVS field in column
 8694        added_columns += self.explode_infos(fields=[snpeff_field])
 8695
 8696        if snpeff_field in vcf_reader.infos:
 8697
 8698            log.debug(vcf_reader.infos[snpeff_field])
 8699
 8700            # Extract ANN header
 8701            ann_description = vcf_reader.infos[snpeff_field].desc
 8702            pattern = r"'(.+?)'"
 8703            match = re.search(pattern, ann_description)
 8704            if match:
 8705                ann_header_match = match.group(1).split(" | ")
 8706                ann_header_desc = {}
 8707                for i in range(len(ann_header_match)):
 8708                    ann_header_info = "".join(
 8709                        char for char in ann_header_match[i] if char.isalnum()
 8710                    )
 8711                    ann_header_desc[ann_header_info] = ann_header_match[i]
 8712                if not ann_header_desc:
 8713                    raise ValueError("Invalid header description format")
 8714            else:
 8715                raise ValueError("Invalid header description format")
 8716
 8717            # Create variant id
 8718            variant_id_column = self.get_variant_id_column()
 8719            added_columns += [variant_id_column]
 8720
 8721            # Create dataframe
 8722            dataframe_snpeff_hgvs = self.get_query_to_df(
 8723                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
 8724            )
 8725
 8726            # Create main NOMEN column
 8727            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
 8728                speff_ann_infos
 8729            ].apply(
 8730                lambda x: extract_snpeff_hgvs(
 8731                    str(x), header=list(ann_header_desc.values())
 8732                )
 8733            )
 8734
 8735            # Add snpeff_hgvs to header
 8736            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
 8737                snpeff_hgvs,
 8738                ".",
 8739                "String",
 8740                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
 8741                "howard calculation",
 8742                "0",
 8743                self.code_type_map.get("String"),
 8744            )
 8745
 8746            # Update
 8747            sql_update = f"""
 8748                UPDATE variants
 8749                SET "INFO" = 
 8750                    concat(
 8751                        CASE
 8752                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8753                            THEN ''
 8754                            ELSE concat("INFO", ';')
 8755                        END,
 8756                        CASE 
 8757                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
 8758                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
 8759                            THEN concat(
 8760                                    '{snpeff_hgvs}=',
 8761                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
 8762                                )
 8763                            ELSE ''
 8764                        END
 8765                    )
 8766                FROM dataframe_snpeff_hgvs
 8767                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
 8768
 8769            """
 8770            self.conn.execute(sql_update)
 8771
 8772            # Delete dataframe
 8773            del dataframe_snpeff_hgvs
 8774            gc.collect()
 8775
 8776        else:
 8777
 8778            log.warning(
 8779                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
 8780            )
 8781
 8782        # Remove added columns
 8783        for added_column in added_columns:
 8784            self.drop_column(column=added_column)
 8785
 8786    def calculation_snpeff_ann_explode(
 8787        self,
 8788        uniquify: bool = True,
 8789        output_format: str = "fields",
 8790        output_prefix: str = "snpeff_",
 8791        snpeff_field: str = "ANN",
 8792    ) -> None:
 8793        """
 8794        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
 8795        exploding the HGVS field and updating variant information accordingly.
 8796
 8797        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
 8798        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
 8799        it indicates that the output should be unique, meaning that duplicate entries should be removed,
 8800        defaults to True
 8801        :type uniquify: bool (optional)
 8802        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
 8803        function specifies the format in which the output annotations will be generated. It has a
 8804        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
 8805        format, defaults to fields
 8806        :type output_format: str (optional)
 8807        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
 8808        method is used to specify the prefix that will be added to the output annotations generated
 8809        during the calculation process. This prefix helps to differentiate the newly added annotations
 8810        from existing ones in the output data. By default, the, defaults to ANN_
 8811        :type output_prefix: str (optional)
 8812        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
 8813        function is used to specify the field in the VCF file that contains SnpEff annotations. This
 8814        field will be processed to explode the HGVS annotations and update the variant information
 8815        accordingly, defaults to ANN
 8816        :type snpeff_field: str (optional)
 8817        """
 8818
 8819        # SnpEff annotation field
 8820        snpeff_hgvs = "snpeff_ann_explode"
 8821
 8822        # Snpeff hgvs tags
 8823        vcf_infos_tags = {
 8824            snpeff_hgvs: "Explode snpEff annotations",
 8825        }
 8826
 8827        # Prefix
 8828        prefix = self.get_explode_infos_prefix()
 8829        if prefix:
 8830            prefix = "INFO/"
 8831
 8832        # snpEff fields
 8833        speff_ann_infos = prefix + snpeff_field
 8834        speff_hgvs_infos = prefix + snpeff_hgvs
 8835
 8836        # Variants table
 8837        table_variants = self.get_table_variants()
 8838
 8839        # Header
 8840        vcf_reader = self.get_header()
 8841
 8842        # Add columns
 8843        added_columns = []
 8844
 8845        # Explode HGVS field in column
 8846        added_columns += self.explode_infos(fields=[snpeff_field])
 8847        log.debug(f"snpeff_field={snpeff_field}")
 8848        log.debug(f"added_columns={added_columns}")
 8849
 8850        if snpeff_field in vcf_reader.infos:
 8851
 8852            # Extract ANN header
 8853            ann_description = vcf_reader.infos[snpeff_field].desc
 8854            pattern = r"'(.+?)'"
 8855            match = re.search(pattern, ann_description)
 8856            if match:
 8857                ann_header_match = match.group(1).split(" | ")
 8858                ann_header = []
 8859                ann_header_desc = {}
 8860                for i in range(len(ann_header_match)):
 8861                    ann_header_info = "".join(
 8862                        char for char in ann_header_match[i] if char.isalnum()
 8863                    )
 8864                    ann_header.append(ann_header_info)
 8865                    ann_header_desc[ann_header_info] = ann_header_match[i]
 8866                if not ann_header_desc:
 8867                    raise ValueError("Invalid header description format")
 8868            else:
 8869                raise ValueError("Invalid header description format")
 8870
 8871            # Create variant id
 8872            variant_id_column = self.get_variant_id_column()
 8873            added_columns += [variant_id_column]
 8874
 8875            # Create dataframe
 8876            dataframe_snpeff_hgvs = self.get_query_to_df(
 8877                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
 8878            )
 8879
 8880            # Create snpEff columns
 8881            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
 8882                speff_ann_infos
 8883            ].apply(
 8884                lambda x: explode_snpeff_ann(
 8885                    str(x),
 8886                    uniquify=uniquify,
 8887                    output_format=output_format,
 8888                    prefix=output_prefix,
 8889                    header=list(ann_header_desc.values()),
 8890                )
 8891            )
 8892
 8893            # Header
 8894            ann_annotations_prefix = ""
 8895            if output_format.upper() in ["JSON"]:
 8896                ann_annotations_prefix = f"{output_prefix}="
 8897                vcf_reader.infos[output_prefix] = vcf.parser._Info(
 8898                    output_prefix,
 8899                    ".",
 8900                    "String",
 8901                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8902                    + " - JSON format",
 8903                    "howard calculation",
 8904                    "0",
 8905                    self.code_type_map.get("String"),
 8906                )
 8907            else:
 8908                for ann_annotation in ann_header:
 8909                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
 8910                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
 8911                        ann_annotation_id,
 8912                        ".",
 8913                        "String",
 8914                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8915                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
 8916                        "howard calculation",
 8917                        "0",
 8918                        self.code_type_map.get("String"),
 8919                    )
 8920
 8921            # Update
 8922            sql_update = f"""
 8923                UPDATE variants
 8924                SET "INFO" = 
 8925                    concat(
 8926                        CASE
 8927                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8928                            THEN ''
 8929                            ELSE concat("INFO", ';')
 8930                        END,
 8931                        CASE 
 8932                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
 8933                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
 8934                            THEN concat(
 8935                                '{ann_annotations_prefix}',
 8936                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
 8937                                )
 8938                            ELSE ''
 8939                        END
 8940                    )
 8941                FROM dataframe_snpeff_hgvs
 8942                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
 8943
 8944            """
 8945            self.conn.execute(sql_update)
 8946
 8947            # Delete dataframe
 8948            del dataframe_snpeff_hgvs
 8949            gc.collect()
 8950
 8951        else:
 8952
 8953            log.warning(
 8954                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
 8955            )
 8956
 8957        # Remove added columns
 8958        for added_column in added_columns:
 8959            self.drop_column(column=added_column)
 8960
 8961    def calculation_extract_nomen(self) -> None:
 8962        """
 8963        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
 8964        """
 8965
 8966        # NOMEN field
 8967        field_nomen_dict = "NOMEN_DICT"
 8968
 8969        # NOMEN structure
 8970        nomen_dict = {
 8971            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
 8972            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
 8973            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
 8974            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
 8975            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
 8976            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
 8977            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
 8978            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
 8979            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
 8980            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
 8981        }
 8982
 8983        # Param
 8984        param = self.get_param()
 8985
 8986        # Threads
 8987        threads = self.get_threads()
 8988
 8989        # Prefix
 8990        prefix = self.get_explode_infos_prefix()
 8991
 8992        # Header
 8993        vcf_reader = self.get_header()
 8994
 8995        # Added columns
 8996        added_columns = []
 8997
 8998        # Get HGVS field
 8999        hgvs_field = (
 9000            param.get("calculation", {})
 9001            .get("calculations", {})
 9002            .get("NOMEN", {})
 9003            .get("options", {})
 9004            .get("hgvs_field", "hgvs")
 9005        )
 9006
 9007        # Get NOMEN pattern
 9008        nomen_pattern = (
 9009            param.get("calculation", {})
 9010            .get("calculations", {})
 9011            .get("NOMEN", {})
 9012            .get("options", {})
 9013            .get("pattern", None)
 9014        )
 9015
 9016        # transcripts list of preference sources
 9017        transcripts_sources = {}
 9018
 9019        # Get transcripts
 9020        transcripts_file = (
 9021            param.get("calculation", {})
 9022            .get("calculations", {})
 9023            .get("NOMEN", {})
 9024            .get("options", {})
 9025            .get("transcripts", None)
 9026        )
 9027        transcripts_file = full_path(transcripts_file)
 9028        if transcripts_file:
 9029            if os.path.exists(transcripts_file):
 9030                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
 9031                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
 9032                transcripts_sources["file"] = transcripts_from_file
 9033            else:
 9034                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
 9035                log.error(msg_err)
 9036                raise ValueError(msg_err)
 9037
 9038        # Get transcripts table
 9039        transcripts_table = (
 9040            param.get("calculation", {})
 9041            .get("calculations", {})
 9042            .get("NOMEN", {})
 9043            .get("options", {})
 9044            .get("transcripts_table", self.get_table_variants())
 9045        )
 9046        # Get transcripts column
 9047        transcripts_column = (
 9048            param.get("calculation", {})
 9049            .get("calculations", {})
 9050            .get("NOMEN", {})
 9051            .get("options", {})
 9052            .get("transcripts_column", None)
 9053        )
 9054
 9055        if transcripts_table and transcripts_column:
 9056            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
 9057            # Explode if not exists
 9058            added_columns += self.explode_infos(
 9059                fields=[transcripts_column], table=transcripts_table
 9060            )
 9061        else:
 9062            extra_field_transcript = f"NULL"
 9063
 9064        # Transcripts of preference source order
 9065        transcripts_order = (
 9066            param.get("calculation", {})
 9067            .get("calculations", {})
 9068            .get("NOMEN", {})
 9069            .get("options", {})
 9070            .get("transcripts_order", ["column", "file"])
 9071        )
 9072
 9073        # Transcripts from file
 9074        transcripts = transcripts_sources.get("file", [])
 9075
 9076        # Explode HGVS field in column
 9077        added_columns += self.explode_infos(fields=[hgvs_field])
 9078
 9079        # extra infos
 9080        extra_infos = self.get_extra_infos()
 9081        extra_field = prefix + hgvs_field
 9082
 9083        if extra_field in extra_infos:
 9084
 9085            # Create dataframe
 9086            dataframe_hgvs = self.get_query_to_df(
 9087                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
 9088            )
 9089
 9090            # Transcripts rank
 9091            transcripts_rank = {
 9092                transcript: rank for rank, transcript in enumerate(transcripts, start=1)
 9093            }
 9094            transcripts_len = len(transcripts_rank)
 9095
 9096            # Create main NOMEN column
 9097            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
 9098                lambda x: find_nomen(
 9099                    hgvs=x.hgvs,
 9100                    transcript=x.transcript,
 9101                    transcripts=transcripts_rank,
 9102                    pattern=nomen_pattern,
 9103                    transcripts_source_order=transcripts_order,
 9104                    transcripts_len=transcripts_len,
 9105                ),
 9106                axis=1,
 9107            )
 9108
 9109            # Explode NOMEN Structure and create SQL set for update
 9110            sql_nomen_fields = []
 9111            for nomen_field in nomen_dict:
 9112
 9113                # Create VCF header field
 9114                vcf_reader.infos[nomen_field] = vcf.parser._Info(
 9115                    nomen_field,
 9116                    ".",
 9117                    "String",
 9118                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
 9119                    "howard calculation",
 9120                    "0",
 9121                    self.code_type_map.get("String"),
 9122                )
 9123
 9124                # Add field to SQL query update
 9125                sql_nomen_fields.append(
 9126                    f"""
 9127                        CASE 
 9128                            WHEN dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT NULL AND dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT IN ('')
 9129                            THEN concat(
 9130                                    ';{nomen_field}=',
 9131                                    dataframe_hgvs."{field_nomen_dict}"."{nomen_field}"
 9132                                )
 9133                            ELSE ''
 9134                        END
 9135                    """
 9136                )
 9137
 9138            # SQL set for update
 9139            sql_nomen_fields_set = ", ".join(sql_nomen_fields)
 9140
 9141            # Update
 9142            sql_update = f"""
 9143                UPDATE variants
 9144                SET "INFO" = 
 9145                    concat(
 9146                        CASE
 9147                            WHEN "INFO" IS NULL
 9148                            THEN ''
 9149                            ELSE "INFO"
 9150                        END,
 9151                        {sql_nomen_fields_set}
 9152                    )
 9153                FROM dataframe_hgvs
 9154                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
 9155                    AND variants."POS" = dataframe_hgvs."POS" 
 9156                    AND variants."REF" = dataframe_hgvs."REF"
 9157                    AND variants."ALT" = dataframe_hgvs."ALT"
 9158            """
 9159            self.conn.execute(sql_update)
 9160
 9161            # Delete dataframe
 9162            del dataframe_hgvs
 9163            gc.collect()
 9164
 9165        # Remove added columns
 9166        for added_column in added_columns:
 9167            self.drop_column(column=added_column)
 9168
 9169    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
 9170        """
 9171        The function `calculation_find_by_pipeline` performs a calculation to find the number of
 9172        pipeline/sample for a variant and updates the variant information in a VCF file.
 9173
 9174        :param tag: The `tag` parameter is a string that represents the annotation field for the
 9175        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
 9176        VCF header and to update the corresponding field in the variants table, defaults to
 9177        findbypipeline
 9178        :type tag: str (optional)
 9179        """
 9180
 9181        # if FORMAT and samples
 9182        if (
 9183            "FORMAT" in self.get_header_columns_as_list()
 9184            and self.get_header_sample_list()
 9185        ):
 9186
 9187            # findbypipeline annotation field
 9188            findbypipeline_tag = tag
 9189
 9190            # VCF infos tags
 9191            vcf_infos_tags = {
 9192                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
 9193            }
 9194
 9195            # Prefix
 9196            prefix = self.get_explode_infos_prefix()
 9197
 9198            # Field
 9199            findbypipeline_infos = prefix + findbypipeline_tag
 9200
 9201            # Variants table
 9202            table_variants = self.get_table_variants()
 9203
 9204            # Header
 9205            vcf_reader = self.get_header()
 9206
 9207            # Create variant id
 9208            variant_id_column = self.get_variant_id_column()
 9209            added_columns = [variant_id_column]
 9210
 9211            # variant_id, FORMAT and samples
 9212            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9213                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
 9214            )
 9215
 9216            # Create dataframe
 9217            dataframe_findbypipeline = self.get_query_to_df(
 9218                f""" SELECT {samples_fields} FROM {table_variants} """
 9219            )
 9220
 9221            # Create findbypipeline column
 9222            dataframe_findbypipeline[findbypipeline_infos] = (
 9223                dataframe_findbypipeline.apply(
 9224                    lambda row: findbypipeline(
 9225                        row, samples=self.get_header_sample_list()
 9226                    ),
 9227                    axis=1,
 9228                )
 9229            )
 9230
 9231            # Add snpeff_hgvs to header
 9232            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
 9233                findbypipeline_tag,
 9234                ".",
 9235                "String",
 9236                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
 9237                "howard calculation",
 9238                "0",
 9239                self.code_type_map.get("String"),
 9240            )
 9241
 9242            # Update
 9243            sql_update = f"""
 9244                UPDATE variants
 9245                SET "INFO" = 
 9246                    concat(
 9247                        CASE
 9248                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9249                            THEN ''
 9250                            ELSE concat("INFO", ';')
 9251                        END,
 9252                        CASE 
 9253                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
 9254                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
 9255                            THEN concat(
 9256                                    '{findbypipeline_tag}=',
 9257                                    dataframe_findbypipeline."{findbypipeline_infos}"
 9258                                )
 9259                            ELSE ''
 9260                        END
 9261                    )
 9262                FROM dataframe_findbypipeline
 9263                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
 9264            """
 9265            self.conn.execute(sql_update)
 9266
 9267            # Remove added columns
 9268            for added_column in added_columns:
 9269                self.drop_column(column=added_column)
 9270
 9271            # Delete dataframe
 9272            del dataframe_findbypipeline
 9273            gc.collect()
 9274
 9275    def calculation_genotype_concordance(self) -> None:
 9276        """
 9277        The function `calculation_genotype_concordance` calculates the genotype concordance for
 9278        multi-caller VCF files and updates the variant information in the database.
 9279        """
 9280
 9281        # if FORMAT and samples
 9282        if (
 9283            "FORMAT" in self.get_header_columns_as_list()
 9284            and self.get_header_sample_list()
 9285        ):
 9286
 9287            # genotypeconcordance annotation field
 9288            genotypeconcordance_tag = "genotypeconcordance"
 9289
 9290            # VCF infos tags
 9291            vcf_infos_tags = {
 9292                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
 9293            }
 9294
 9295            # Prefix
 9296            prefix = self.get_explode_infos_prefix()
 9297
 9298            # Field
 9299            genotypeconcordance_infos = prefix + genotypeconcordance_tag
 9300
 9301            # Variants table
 9302            table_variants = self.get_table_variants()
 9303
 9304            # Header
 9305            vcf_reader = self.get_header()
 9306
 9307            # Create variant id
 9308            variant_id_column = self.get_variant_id_column()
 9309            added_columns = [variant_id_column]
 9310
 9311            # variant_id, FORMAT and samples
 9312            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9313                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
 9314            )
 9315
 9316            # Create dataframe
 9317            dataframe_genotypeconcordance = self.get_query_to_df(
 9318                f""" SELECT {samples_fields} FROM {table_variants} """
 9319            )
 9320
 9321            # Create genotypeconcordance column
 9322            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
 9323                dataframe_genotypeconcordance.apply(
 9324                    lambda row: genotypeconcordance(
 9325                        row, samples=self.get_header_sample_list()
 9326                    ),
 9327                    axis=1,
 9328                )
 9329            )
 9330
 9331            # Add genotypeconcordance to header
 9332            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
 9333                genotypeconcordance_tag,
 9334                ".",
 9335                "String",
 9336                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
 9337                "howard calculation",
 9338                "0",
 9339                self.code_type_map.get("String"),
 9340            )
 9341
 9342            # Update
 9343            sql_update = f"""
 9344                UPDATE variants
 9345                SET "INFO" = 
 9346                    concat(
 9347                        CASE
 9348                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9349                            THEN ''
 9350                            ELSE concat("INFO", ';')
 9351                        END,
 9352                        CASE
 9353                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
 9354                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
 9355                            THEN concat(
 9356                                    '{genotypeconcordance_tag}=',
 9357                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
 9358                                )
 9359                            ELSE ''
 9360                        END
 9361                    )
 9362                FROM dataframe_genotypeconcordance
 9363                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
 9364            """
 9365            self.conn.execute(sql_update)
 9366
 9367            # Remove added columns
 9368            for added_column in added_columns:
 9369                self.drop_column(column=added_column)
 9370
 9371            # Delete dataframe
 9372            del dataframe_genotypeconcordance
 9373            gc.collect()
 9374
 9375    def calculation_barcode(self, tag: str = "barcode") -> None:
 9376        """
 9377        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
 9378        updates the INFO field in the file with the calculated barcode values.
 9379
 9380        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
 9381        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
 9382        the default tag name is set to "barcode", defaults to barcode
 9383        :type tag: str (optional)
 9384        """
 9385
 9386        # if FORMAT and samples
 9387        if (
 9388            "FORMAT" in self.get_header_columns_as_list()
 9389            and self.get_header_sample_list()
 9390        ):
 9391
 9392            # barcode annotation field
 9393            if not tag:
 9394                tag = "barcode"
 9395
 9396            # VCF infos tags
 9397            vcf_infos_tags = {
 9398                tag: "barcode calculation (VaRank)",
 9399            }
 9400
 9401            # Prefix
 9402            prefix = self.get_explode_infos_prefix()
 9403
 9404            # Field
 9405            barcode_infos = prefix + tag
 9406
 9407            # Variants table
 9408            table_variants = self.get_table_variants()
 9409
 9410            # Header
 9411            vcf_reader = self.get_header()
 9412
 9413            # Create variant id
 9414            variant_id_column = self.get_variant_id_column()
 9415            added_columns = [variant_id_column]
 9416
 9417            # variant_id, FORMAT and samples
 9418            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9419                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
 9420            )
 9421
 9422            # Create dataframe
 9423            dataframe_barcode = self.get_query_to_df(
 9424                f""" SELECT {samples_fields} FROM {table_variants} """
 9425            )
 9426
 9427            # Create barcode column
 9428            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 9429                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
 9430            )
 9431
 9432            # Add barcode to header
 9433            vcf_reader.infos[tag] = vcf.parser._Info(
 9434                tag,
 9435                ".",
 9436                "String",
 9437                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
 9438                "howard calculation",
 9439                "0",
 9440                self.code_type_map.get("String"),
 9441            )
 9442
 9443            # Update
 9444            sql_update = f"""
 9445                UPDATE {table_variants}
 9446                SET "INFO" = 
 9447                    concat(
 9448                        CASE
 9449                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9450                            THEN ''
 9451                            ELSE concat("INFO", ';')
 9452                        END,
 9453                        CASE
 9454                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
 9455                            AND dataframe_barcode."{barcode_infos}" NOT NULL
 9456                            THEN concat(
 9457                                    '{tag}=',
 9458                                    dataframe_barcode."{barcode_infos}"
 9459                                )
 9460                            ELSE ''
 9461                        END
 9462                    )
 9463                FROM dataframe_barcode
 9464                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 9465            """
 9466            self.conn.execute(sql_update)
 9467
 9468            # Remove added columns
 9469            for added_column in added_columns:
 9470                self.drop_column(column=added_column)
 9471
 9472            # Delete dataframe
 9473            del dataframe_barcode
 9474            gc.collect()
 9475
 9476    def calculation_barcode_family(self, tag: str = "BCF") -> None:
 9477        """
 9478        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
 9479        and updates the INFO field in the file with the calculated barcode values.
 9480
 9481        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
 9482        the barcode tag that will be added to the VCF file during the calculation process. If no value
 9483        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
 9484        :type tag: str (optional)
 9485        """
 9486
 9487        # if FORMAT and samples
 9488        if (
 9489            "FORMAT" in self.get_header_columns_as_list()
 9490            and self.get_header_sample_list()
 9491        ):
 9492
 9493            # barcode annotation field
 9494            if not tag:
 9495                tag = "BCF"
 9496
 9497            # VCF infos tags
 9498            vcf_infos_tags = {
 9499                tag: "barcode family calculation",
 9500                f"{tag}S": "barcode family samples",
 9501            }
 9502
 9503            # Param
 9504            param = self.get_param()
 9505            log.debug(f"param={param}")
 9506
 9507            # Prefix
 9508            prefix = self.get_explode_infos_prefix()
 9509
 9510            # PED param
 9511            ped = (
 9512                param.get("calculation", {})
 9513                .get("calculations", {})
 9514                .get("BARCODEFAMILY", {})
 9515                .get("family_pedigree", None)
 9516            )
 9517            log.debug(f"ped={ped}")
 9518
 9519            # Load PED
 9520            if ped:
 9521
 9522                # Pedigree is a file
 9523                if isinstance(ped, str) and os.path.exists(full_path(ped)):
 9524                    log.debug("Pedigree is file")
 9525                    with open(full_path(ped)) as ped:
 9526                        ped = yaml.safe_load(ped)
 9527
 9528                # Pedigree is a string
 9529                elif isinstance(ped, str):
 9530                    log.debug("Pedigree is str")
 9531                    try:
 9532                        ped = json.loads(ped)
 9533                        log.debug("Pedigree is json str")
 9534                    except ValueError as e:
 9535                        ped_samples = ped.split(",")
 9536                        ped = {}
 9537                        for ped_sample in ped_samples:
 9538                            ped[ped_sample] = ped_sample
 9539
 9540                # Pedigree is a dict
 9541                elif isinstance(ped, dict):
 9542                    log.debug("Pedigree is dict")
 9543
 9544                # Pedigree is not well formatted
 9545                else:
 9546                    msg_error = "Pedigree not well formatted"
 9547                    log.error(msg_error)
 9548                    raise ValueError(msg_error)
 9549
 9550                # Construct list
 9551                ped_samples = list(ped.values())
 9552
 9553            else:
 9554                log.debug("Pedigree not defined. Take all samples")
 9555                ped_samples = self.get_header_sample_list()
 9556                ped = {}
 9557                for ped_sample in ped_samples:
 9558                    ped[ped_sample] = ped_sample
 9559
 9560            # Check pedigree
 9561            if not ped or len(ped) == 0:
 9562                msg_error = f"Error in pedigree: samples {ped_samples}"
 9563                log.error(msg_error)
 9564                raise ValueError(msg_error)
 9565
 9566            # Log
 9567            log.info(
 9568                "Calculation 'BARCODEFAMILY' - Samples: "
 9569                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
 9570            )
 9571            log.debug(f"ped_samples={ped_samples}")
 9572
 9573            # Field
 9574            barcode_infos = prefix + tag
 9575
 9576            # Variants table
 9577            table_variants = self.get_table_variants()
 9578
 9579            # Header
 9580            vcf_reader = self.get_header()
 9581
 9582            # Create variant id
 9583            variant_id_column = self.get_variant_id_column()
 9584            added_columns = [variant_id_column]
 9585
 9586            # variant_id, FORMAT and samples
 9587            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9588                [f""" "{sample}" """ for sample in ped_samples]
 9589            )
 9590
 9591            # Create dataframe
 9592            dataframe_barcode = self.get_query_to_df(
 9593                f""" SELECT {samples_fields} FROM {table_variants} """
 9594            )
 9595
 9596            # Create barcode column
 9597            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 9598                lambda row: barcode(row, samples=ped_samples), axis=1
 9599            )
 9600
 9601            # Add barcode family to header
 9602            # Add vaf_normalization to header
 9603            vcf_reader.formats[tag] = vcf.parser._Format(
 9604                id=tag,
 9605                num=".",
 9606                type="String",
 9607                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
 9608                type_code=self.code_type_map.get("String"),
 9609            )
 9610            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
 9611                id=f"{tag}S",
 9612                num=".",
 9613                type="String",
 9614                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
 9615                type_code=self.code_type_map.get("String"),
 9616            )
 9617
 9618            # Update
 9619            # for sample in ped_samples:
 9620            sql_update_set = []
 9621            for sample in self.get_header_sample_list() + ["FORMAT"]:
 9622                if sample in ped_samples:
 9623                    value = f'dataframe_barcode."{barcode_infos}"'
 9624                    value_samples = (
 9625                        "'"
 9626                        + ",".join([f""" "{sample}" """ for sample in ped_samples])
 9627                        + "'"
 9628                    )
 9629                    ped_samples
 9630                elif sample == "FORMAT":
 9631                    value = f"'{tag}'"
 9632                    value_samples = f"'{tag}S'"
 9633                else:
 9634                    value = "'.'"
 9635                    value_samples = "'.'"
 9636                format_regex = r"[a-zA-Z0-9\s]"
 9637                sql_update_set.append(
 9638                    f"""
 9639                        "{sample}" = 
 9640                        concat(
 9641                            CASE
 9642                                WHEN {table_variants}."{sample}" = './.'
 9643                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
 9644                                ELSE {table_variants}."{sample}"
 9645                            END,
 9646                            ':',
 9647                            {value},
 9648                            ':',
 9649                            {value_samples}
 9650                        )
 9651                    """
 9652                )
 9653
 9654            sql_update_set_join = ", ".join(sql_update_set)
 9655            sql_update = f"""
 9656                UPDATE {table_variants}
 9657                SET {sql_update_set_join}
 9658                FROM dataframe_barcode
 9659                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 9660            """
 9661            self.conn.execute(sql_update)
 9662
 9663            # Remove added columns
 9664            for added_column in added_columns:
 9665                self.drop_column(column=added_column)
 9666
 9667            # Delete dataframe
 9668            del dataframe_barcode
 9669            gc.collect()
 9670
 9671    def calculation_trio(self) -> None:
 9672        """
 9673        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
 9674        information to the INFO field of each variant.
 9675        """
 9676
 9677        # if FORMAT and samples
 9678        if (
 9679            "FORMAT" in self.get_header_columns_as_list()
 9680            and self.get_header_sample_list()
 9681        ):
 9682
 9683            # trio annotation field
 9684            trio_tag = "trio"
 9685
 9686            # VCF infos tags
 9687            vcf_infos_tags = {
 9688                "trio": "trio calculation",
 9689            }
 9690
 9691            # Param
 9692            param = self.get_param()
 9693
 9694            # Prefix
 9695            prefix = self.get_explode_infos_prefix()
 9696
 9697            # Trio param
 9698            trio_ped = (
 9699                param.get("calculation", {})
 9700                .get("calculations", {})
 9701                .get("TRIO", {})
 9702                .get("trio_pedigree", None)
 9703            )
 9704
 9705            # Load trio
 9706            if trio_ped:
 9707
 9708                # Trio pedigree is a file
 9709                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
 9710                    log.debug("TRIO pedigree is file")
 9711                    with open(full_path(trio_ped)) as trio_ped:
 9712                        trio_ped = yaml.safe_load(trio_ped)
 9713
 9714                # Trio pedigree is a string
 9715                elif isinstance(trio_ped, str):
 9716                    log.debug("TRIO pedigree is str")
 9717                    try:
 9718                        trio_ped = json.loads(trio_ped)
 9719                        log.debug("TRIO pedigree is json str")
 9720                    except ValueError as e:
 9721                        trio_samples = trio_ped.split(",")
 9722                        if len(trio_samples) == 3:
 9723                            trio_ped = {
 9724                                "father": trio_samples[0],
 9725                                "mother": trio_samples[1],
 9726                                "child": trio_samples[2],
 9727                            }
 9728                            log.debug("TRIO pedigree is list str")
 9729                        else:
 9730                            msg_error = "TRIO pedigree not well formatted"
 9731                            log.error(msg_error)
 9732                            raise ValueError(msg_error)
 9733
 9734                # Trio pedigree is a dict
 9735                elif isinstance(trio_ped, dict):
 9736                    log.debug("TRIO pedigree is dict")
 9737
 9738                # Trio pedigree is not well formatted
 9739                else:
 9740                    msg_error = "TRIO pedigree not well formatted"
 9741                    log.error(msg_error)
 9742                    raise ValueError(msg_error)
 9743
 9744                # Construct trio list
 9745                trio_samples = [
 9746                    trio_ped.get("father", ""),
 9747                    trio_ped.get("mother", ""),
 9748                    trio_ped.get("child", ""),
 9749                ]
 9750
 9751            else:
 9752                log.debug("TRIO pedigree not defined. Take the first 3 samples")
 9753                samples_list = self.get_header_sample_list()
 9754                if len(samples_list) >= 3:
 9755                    trio_samples = self.get_header_sample_list()[0:3]
 9756                    trio_ped = {
 9757                        "father": trio_samples[0],
 9758                        "mother": trio_samples[1],
 9759                        "child": trio_samples[2],
 9760                    }
 9761                else:
 9762                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
 9763                    log.error(msg_error)
 9764                    raise ValueError(msg_error)
 9765
 9766            # Check trio pedigree
 9767            if not trio_ped or len(trio_ped) != 3:
 9768                msg_error = f"Error in TRIO pedigree: {trio_ped}"
 9769                log.error(msg_error)
 9770                raise ValueError(msg_error)
 9771
 9772            # Log
 9773            log.info(
 9774                f"Calculation 'TRIO' - Samples: "
 9775                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
 9776            )
 9777
 9778            # Field
 9779            trio_infos = prefix + trio_tag
 9780
 9781            # Variants table
 9782            table_variants = self.get_table_variants()
 9783
 9784            # Header
 9785            vcf_reader = self.get_header()
 9786
 9787            # Create variant id
 9788            variant_id_column = self.get_variant_id_column()
 9789            added_columns = [variant_id_column]
 9790
 9791            # variant_id, FORMAT and samples
 9792            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9793                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
 9794            )
 9795
 9796            # Create dataframe
 9797            dataframe_trio = self.get_query_to_df(
 9798                f""" SELECT {samples_fields} FROM {table_variants} """
 9799            )
 9800
 9801            # Create trio column
 9802            dataframe_trio[trio_infos] = dataframe_trio.apply(
 9803                lambda row: trio(row, samples=trio_samples), axis=1
 9804            )
 9805
 9806            # Add trio to header
 9807            vcf_reader.infos[trio_tag] = vcf.parser._Info(
 9808                trio_tag,
 9809                ".",
 9810                "String",
 9811                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
 9812                "howard calculation",
 9813                "0",
 9814                self.code_type_map.get("String"),
 9815            )
 9816
 9817            # Update
 9818            sql_update = f"""
 9819                UPDATE {table_variants}
 9820                SET "INFO" = 
 9821                    concat(
 9822                        CASE
 9823                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9824                            THEN ''
 9825                            ELSE concat("INFO", ';')
 9826                        END,
 9827                        CASE
 9828                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
 9829                             AND dataframe_trio."{trio_infos}" NOT NULL
 9830                            THEN concat(
 9831                                    '{trio_tag}=',
 9832                                    dataframe_trio."{trio_infos}"
 9833                                )
 9834                            ELSE ''
 9835                        END
 9836                    )
 9837                FROM dataframe_trio
 9838                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
 9839            """
 9840            self.conn.execute(sql_update)
 9841
 9842            # Remove added columns
 9843            for added_column in added_columns:
 9844                self.drop_column(column=added_column)
 9845
 9846            # Delete dataframe
 9847            del dataframe_trio
 9848            gc.collect()
 9849
 9850    def calculation_vaf_normalization(self) -> None:
 9851        """
 9852        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
 9853        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
 9854        :return: The function does not return anything.
 9855        """
 9856
 9857        # if FORMAT and samples
 9858        if (
 9859            "FORMAT" in self.get_header_columns_as_list()
 9860            and self.get_header_sample_list()
 9861        ):
 9862
 9863            # vaf_normalization annotation field
 9864            vaf_normalization_tag = "VAF"
 9865
 9866            # VCF infos tags
 9867            vcf_infos_tags = {
 9868                "VAF": "VAF Variant Frequency",
 9869            }
 9870
 9871            # Prefix
 9872            prefix = self.get_explode_infos_prefix()
 9873
 9874            # Variants table
 9875            table_variants = self.get_table_variants()
 9876
 9877            # Header
 9878            vcf_reader = self.get_header()
 9879
 9880            # Do not calculate if VAF already exists
 9881            if "VAF" in vcf_reader.formats:
 9882                log.debug("VAF already on genotypes")
 9883                return
 9884
 9885            # Create variant id
 9886            variant_id_column = self.get_variant_id_column()
 9887            added_columns = [variant_id_column]
 9888
 9889            # variant_id, FORMAT and samples
 9890            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9891                f""" "{sample}" """ for sample in self.get_header_sample_list()
 9892            )
 9893
 9894            # Create dataframe
 9895            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
 9896            log.debug(f"query={query}")
 9897            dataframe_vaf_normalization = self.get_query_to_df(query=query)
 9898
 9899            vaf_normalization_set = []
 9900
 9901            # for each sample vaf_normalization
 9902            for sample in self.get_header_sample_list():
 9903                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
 9904                    lambda row: vaf_normalization(row, sample=sample), axis=1
 9905                )
 9906                vaf_normalization_set.append(
 9907                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
 9908                )
 9909
 9910            # Add VAF to FORMAT
 9911            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
 9912                "FORMAT"
 9913            ].apply(lambda x: str(x) + ":VAF")
 9914            vaf_normalization_set.append(
 9915                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
 9916            )
 9917
 9918            # Add vaf_normalization to header
 9919            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
 9920                id=vaf_normalization_tag,
 9921                num="1",
 9922                type="Float",
 9923                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
 9924                type_code=self.code_type_map.get("Float"),
 9925            )
 9926
 9927            # Create fields to add in INFO
 9928            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
 9929
 9930            # Update
 9931            sql_update = f"""
 9932                UPDATE {table_variants}
 9933                SET {sql_vaf_normalization_set}
 9934                FROM dataframe_vaf_normalization
 9935                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
 9936
 9937            """
 9938            self.conn.execute(sql_update)
 9939
 9940            # Remove added columns
 9941            for added_column in added_columns:
 9942                self.drop_column(column=added_column)
 9943
 9944            # Delete dataframe
 9945            del dataframe_vaf_normalization
 9946            gc.collect()
 9947
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        The computed statistics are added to INFO as tags '<info>_stats_nb', '<info>_stats_list',
        '<info>_stats_min', '<info>_stats_max', '<info>_stats_mean', '<info>_stats_mediane' and
        '<info>_stats_stdev', each declared in the VCF header.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Only applicable when the file has genotypes (FORMAT column and at least one sample)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags: one entry per statistic tag to add to INFO/header
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field holding the full stats result in the dataframe
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (column added to join dataframe and variants table)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Create dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create vaf_stats column: genotype_stats returns a dict-convertible
            # mapping of stat tag -> value for each variant row
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # List of vcf tags
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract this statistic into its own dataframe column
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Add this statistic tag to the header as an INFO field
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Separator: ';' before every tag except the first
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                sql_vaf_stats_fields.append(
                    f"""
                        CASE
                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
                            THEN concat(
                                    '{sep}{stat}=',
                                    dataframe_vaf_stats."{stat}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)

            # Update: append all computed stat tags to INFO, handling empty/missing INFO
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_stats
            gc.collect()
10085
10086    def calculation_transcripts_annotation(
10087        self, info_json: str = None, info_format: str = None
10088    ) -> None:
10089        """
10090        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
10091        field to it if transcripts are available.
10092
10093        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
10094        is a string parameter that represents the information field to be used in the transcripts JSON.
10095        It is used to specify the JSON format for the transcripts information. If no value is provided
10096        when calling the method, it defaults to "
10097        :type info_json: str
10098        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
10099        method is a string parameter that specifies the format of the information field to be used in
10100        the transcripts JSON. It is used to define the format of the information field
10101        :type info_format: str
10102        """
10103
10104        # Create transcripts table
10105        transcripts_table = self.create_transcript_view()
10106
10107        # Add info field
10108        if transcripts_table:
10109            self.transcript_view_to_variants(
10110                transcripts_table=transcripts_table,
10111                transcripts_info_field_json=info_json,
10112                transcripts_info_field_format=info_format,
10113            )
10114        else:
10115            log.info("No Transcripts to process. Check param.json file configuration")
10116
10117    def calculation_transcripts_prioritization(self) -> None:
10118        """
10119        The function `calculation_transcripts_prioritization` creates a transcripts table and
10120        prioritizes transcripts based on certain criteria.
10121        """
10122
10123        # Create transcripts table
10124        transcripts_table = self.create_transcript_view()
10125
10126        # Add info field
10127        if transcripts_table:
10128            self.transcripts_prioritization(transcripts_table=transcripts_table)
10129        else:
10130            log.info("No Transcripts to process. Check param.json file configuration")
10131
10132    def calculation_transcripts_export(self) -> None:
10133        """ """
10134
10135        # Create transcripts table
10136        transcripts_table = self.create_transcript_view()
10137
10138        # Add info field
10139        if transcripts_table:
10140            self.transcripts_export(transcripts_table=transcripts_table)
10141        else:
10142            log.info("No Transcripts to process. Check param.json file configuration")
10143
10144    ###############
10145    # Transcripts #
10146    ###############
10147
10148    def transcripts_export(
10149        self, transcripts_table: str = None, param: dict = {}
10150    ) -> bool:
10151        """ """
10152
10153        log.debug("Start transcripts export...")
10154
10155        # Param
10156        if not param:
10157            param = self.get_param()
10158
10159        # Param export
10160        param_transcript_export = param.get("transcripts", {}).get("export", {})
10161
10162        # Output file
10163        transcripts_export_output = param_transcript_export.get("output", None)
10164
10165        if not param_transcript_export or not transcripts_export_output:
10166            log.warning(f"No transcriipts export parameters defined!")
10167            return False
10168
10169        # List of transcripts annotations
10170        query_describe = f"""
10171            SELECT column_name
10172            FROM (
10173                    DESCRIBE SELECT * FROM {transcripts_table}
10174                )
10175            WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO')
10176        """
10177        transcripts_annotations_list = list(
10178            self.get_query_to_df(query=query_describe)["column_name"]
10179        )
10180
10181        # Create transcripts table for export
10182        transcripts_table_export = f"{transcripts_table}_export_" + "".join(
10183            random.choices(string.ascii_uppercase + string.digits, k=10)
10184        )
10185        query_create_transcripts_table_export = f"""
10186            CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table})
10187        """
10188        self.execute_query(query=query_create_transcripts_table_export)
10189
10190        # Output file format
10191        transcripts_export_output_format = get_file_format(
10192            filename=transcripts_export_output
10193        )
10194
10195        # Format VCF - construct INFO
10196        if transcripts_export_output_format in ["vcf"]:
10197
10198            # Construct query update INFO and header
10199            query_update_info = []
10200            for field in transcripts_annotations_list:
10201
10202                # If field not in header
10203                if field not in self.get_header_infos_list():
10204
10205                    # Add PZ Transcript in header
10206                    self.get_header().infos[field] = vcf.parser._Info(
10207                        field,
10208                        ".",
10209                        "String",
10210                        f"Annotation '{field}' from transcript view",
10211                        "unknown",
10212                        "unknown",
10213                        0,
10214                    )
10215
10216                # Add field as INFO/tag
10217                query_update_info.append(
10218                    f"""
10219                        CASE
10220                            WHEN "{field}" IS NOT NULL
10221                            THEN concat('{field}=', "{field}", ';')    
10222                            ELSE ''     
10223                        END
10224                        """
10225                )
10226
10227            # Query param
10228            query_update_info_value = (
10229                f""" concat('',  {", ".join(query_update_info)}) """
10230            )
10231            query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """
10232
10233        else:
10234
10235            # Query param
10236            query_update_info_value = f""" NULL """
10237            query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """
10238
10239        # Update query INFO column
10240        query_update = f"""
10241            UPDATE {transcripts_table_export}
10242            SET INFO = {query_update_info_value}
10243
10244        """
10245        self.execute_query(query=query_update)
10246
10247        # Export
10248        self.export_output(
10249            output_file=transcripts_export_output,
10250            query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """,
10251        )
10252
10253        # Drop transcripts export table
10254        query_drop_transcripts_table_export = f"""
10255            DROP TABLE {transcripts_table_export}
10256        """
10257        self.execute_query(query=query_drop_transcripts_table_export)
10258
10259    def transcripts_prioritization(
10260        self, transcripts_table: str = None, param: dict = {}
10261    ) -> bool:
10262        """
10263        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
10264        and updates the variants table with the prioritized information.
10265
10266        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10267        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
10268        This parameter is used to identify the table where the transcripts data is stored for the
10269        prioritization process
10270        :type transcripts_table: str
10271        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
10272        that contains various configuration settings for the prioritization process of transcripts. It
10273        is used to customize the behavior of the prioritization algorithm and includes settings such as
10274        the prefix for prioritization fields, default profiles, and other
10275        :type param: dict
10276        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
10277        transcripts prioritization process is successfully completed, and `False` if there are any
10278        issues or if no profile is defined for transcripts prioritization.
10279        """
10280
10281        log.debug("Start transcripts prioritization...")
10282
10283        # Param
10284        if not param:
10285            param = self.get_param()
10286
10287        # Variants table
10288        table_variants = self.get_table_variants()
10289
10290        # Transcripts table
10291        if transcripts_table is None:
10292            transcripts_table = self.create_transcript_view(
10293                transcripts_table="transcripts", param=param
10294            )
10295        if transcripts_table is None:
10296            msg_err = "No Transcripts table availalble"
10297            log.error(msg_err)
10298            raise ValueError(msg_err)
10299        log.debug(f"transcripts_table={transcripts_table}")
10300
10301        # Get transcripts columns
10302        columns_as_list_query = f"""
10303            DESCRIBE {transcripts_table}
10304        """
10305        columns_as_list = list(
10306            self.get_query_to_df(columns_as_list_query)["column_name"]
10307        )
10308
10309        # Create INFO if not exists
10310        if "INFO" not in columns_as_list:
10311            query_add_info = f"""
10312                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
10313            """
10314            self.execute_query(query_add_info)
10315
10316        # Prioritization param and Force only PZ Score and Flag
10317        pz_param = param.get("transcripts", {}).get("prioritization", {})
10318
10319        # PZ profile by default
10320        pz_profile_default = (
10321            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
10322        )
10323
10324        # Exit if no profile
10325        if pz_profile_default is None:
10326            log.warning("No profile defined for transcripts prioritization")
10327            return False
10328
10329        # PZ fields
10330        pz_param_pzfields = {}
10331
10332        # PZ field transcripts
10333        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
10334
10335        # Add PZ Transcript in header
10336        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
10337            pz_fields_transcripts,
10338            ".",
10339            "String",
10340            f"Transcript selected from prioritization process, profile {pz_profile_default}",
10341            "unknown",
10342            "unknown",
10343            code_type_map["String"],
10344        )
10345
10346        # Mandatory fields if asked in param
10347        pz_mandatory_fields_list = [
10348            "Score",
10349            "Flag",
10350            "Tags",
10351            "Comment",
10352            "Infos",
10353            "Class",
10354        ]
10355        pz_mandatory_fields = []
10356        for pz_mandatory_field in pz_mandatory_fields_list:
10357            pz_mandatory_fields.append(
10358                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
10359            )
10360
10361        # PZ fields in param
10362        pz_param_mandatory_fields = []
10363        for pz_field in pz_param.get("pzfields", []):
10364            if pz_field in pz_mandatory_fields_list:
10365                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
10366                    pz_param.get("pzprefix", "PTZ") + pz_field
10367                )
10368                pz_param_mandatory_fields.append(
10369                    pz_param.get("pzprefix", "PTZ") + pz_field
10370                )
10371            else:
10372                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
10373                pz_param_pzfields[pz_field] = pz_field_new
10374
10375                # Add PZ Transcript in header
10376                self.get_header().infos[pz_field_new] = vcf.parser._Info(
10377                    pz_field_new,
10378                    ".",
10379                    "String",
10380                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
10381                    "unknown",
10382                    "unknown",
10383                    code_type_map["String"],
10384                )
10385
10386        # PZ fields param
10387        pz_mandatory_fields = pz_param_mandatory_fields
10388        pz_param["pzfields"] = pz_mandatory_fields
10389
10390        # Prioritization
10391        prioritization_result = self.prioritization(
10392            table=transcripts_table,
10393            pz_param=param.get("transcripts", {}).get("prioritization", {}),
10394        )
10395        if not prioritization_result:
10396            log.warning("Transcripts prioritization not processed")
10397            return False
10398
10399        # PZ fields sql query
10400        query_update_select_list = []
10401        query_update_concat_list = []
10402        query_update_order_list = []
10403        for pz_param_pzfield in set(
10404            list(pz_param_pzfields.keys()) + pz_mandatory_fields
10405        ):
10406            query_update_select_list.append(f" {pz_param_pzfield}, ")
10407
10408        for pz_param_pzfield in pz_param_pzfields:
10409            query_update_concat_list.append(
10410                f"""
10411                    , CASE 
10412                        WHEN {pz_param_pzfield} IS NOT NULL
10413                        THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
10414                        ELSE ''
10415                    END
10416                """
10417            )
10418
10419        # Order by
10420        pz_orders = (
10421            param.get("transcripts", {})
10422            .get("prioritization", {})
10423            .get("prioritization_transcripts_order", {})
10424        )
10425        if not pz_orders:
10426            pz_orders = {
10427                pz_param.get("pzprefix", "PTZ") + "Flag": "DESC",
10428                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
10429            }
10430        for pz_order in pz_orders:
10431            query_update_order_list.append(
10432                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
10433            )
10434
10435        # Fields to explode
10436        fields_to_explode = (
10437            list(pz_param_pzfields.keys())
10438            + pz_mandatory_fields
10439            + list(pz_orders.keys())
10440        )
10441        # Remove transcript column as a specific transcript column
10442        if "transcript" in fields_to_explode:
10443            fields_to_explode.remove("transcript")
10444
10445        # Fields intranscripts table
10446        query_transcripts_table = f"""
10447            DESCRIBE SELECT * FROM {transcripts_table}
10448        """
10449        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)
10450
10451        # Check fields to explode
10452        for field_to_explode in fields_to_explode:
10453            if field_to_explode not in self.get_header_infos_list() + list(
10454                query_transcripts_table.column_name
10455            ):
10456                msg_err = f"INFO/{field_to_explode} NOT IN header"
10457                log.error(msg_err)
10458                raise ValueError(msg_err)
10459
10460        # Explode fields to explode
10461        self.explode_infos(
10462            table=transcripts_table,
10463            fields=fields_to_explode,
10464        )
10465
10466        # Transcript preference file
10467        transcripts_preference_file = (
10468            param.get("transcripts", {})
10469            .get("prioritization", {})
10470            .get("prioritization_transcripts", {})
10471        )
10472        transcripts_preference_file = full_path(transcripts_preference_file)
10473
10474        # Transcript preference forced
10475        transcript_preference_force = (
10476            param.get("transcripts", {})
10477            .get("prioritization", {})
10478            .get("prioritization_transcripts_force", False)
10479        )
10480        # Transcript version forced
10481        transcript_version_force = (
10482            param.get("transcripts", {})
10483            .get("prioritization", {})
10484            .get("prioritization_transcripts_version_force", False)
10485        )
10486
10487        # Transcripts Ranking
10488        if transcripts_preference_file:
10489
10490            # Transcripts file to dataframe
10491            if os.path.exists(transcripts_preference_file):
10492                transcripts_preference_dataframe = transcripts_file_to_df(
10493                    transcripts_preference_file
10494                )
10495            else:
10496                log.error(
10497                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10498                )
10499                raise ValueError(
10500                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10501                )
10502
10503            # Order by depending to transcript preference forcing
10504            if transcript_preference_force:
10505                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
10506            else:
10507                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """
10508
10509            # Transcript columns joined depend on version consideration
10510            if transcript_version_force:
10511                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
10512            else:
10513                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """
10514
10515            # Query ranking for update
10516            query_update_ranking = f"""
10517                SELECT
10518                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
10519                    ROW_NUMBER() OVER (
10520                        PARTITION BY "#CHROM", POS, REF, ALT
10521                        ORDER BY {order_by}
10522                    ) AS rn
10523                FROM {transcripts_table}
10524                LEFT JOIN 
10525                    (
10526                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
10527                        FROM transcripts_preference_dataframe
10528                    ) AS transcripts_preference
10529                ON {transcripts_version_join}
10530            """
10531
10532        else:
10533
10534            # Query ranking for update
10535            query_update_ranking = f"""
10536                SELECT
10537                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
10538                    ROW_NUMBER() OVER (
10539                        PARTITION BY "#CHROM", POS, REF, ALT
10540                        ORDER BY {" , ".join(query_update_order_list)}
10541                    ) AS rn
10542                FROM {transcripts_table}
10543            """
10544
10545        # Export Transcripts prioritization infos to variants table
10546        query_update = f"""
10547            WITH RankedTranscripts AS (
10548                {query_update_ranking}
10549            )
10550            UPDATE {table_variants}
10551                SET
10552                INFO = CONCAT(CASE
10553                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
10554                            THEN ''
10555                            ELSE concat("INFO", ';')
10556                        END,
10557                        concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
10558                        )
10559            FROM
10560                RankedTranscripts
10561            WHERE
10562                rn = 1
10563                AND variants."#CHROM" = RankedTranscripts."#CHROM"
10564                AND variants."POS" = RankedTranscripts."POS"
10565                AND variants."REF" = RankedTranscripts."REF"
10566                AND variants."ALT" = RankedTranscripts."ALT"     
10567        """
10568
10569        # log.debug(f"query_update={query_update}")
10570        self.execute_query(query=query_update)
10571
10572        # Return
10573        return True
10574
10575    def create_transcript_view_from_columns_map(
10576        self,
10577        transcripts_table: str = "transcripts",
10578        columns_maps: dict = {},
10579        added_columns: list = [],
10580        temporary_tables: list = None,
10581        annotation_fields: list = None,
10582        column_rename: dict = {},
10583        column_clean: bool = False,
10584        column_case: str = None,
10585    ) -> tuple[list, list, list]:
10586        """
10587        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
10588        specified columns mapping for transcripts data.
10589
10590        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10591        of the table where the transcripts data is stored or will be stored in the database. This table
10592        typically contains information about transcripts such as Ensembl transcript IDs, gene names,
10593        scores, predictions, etc. It defaults to "transcripts, defaults to transcripts
10594        :type transcripts_table: str (optional)
10595        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information
10596        about how to map columns from a transcripts table to create a view. Each entry in the
10597        `columns_maps` list represents a mapping configuration for a specific set of columns. It
10598        typically includes details such as the main transcript column and additional information columns
10599        :type columns_maps: dict
10600        :param added_columns: The `added_columns` parameter in the
10601        `create_transcript_view_from_columns_map` function is a list that stores the additional columns
10602        that will be added to the view being created based on the columns map provided. These columns
10603        are generated by exploding the transcript information columns along with the main transcript
10604        column
10605        :type added_columns: list
10606        :param temporary_tables: The `temporary_tables` parameter in the
10607        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
10608        tables created during the process of creating a transcript view from a columns map. These
10609        temporary tables are used to store intermediate results or transformations before the final view
10610        is generated
10611        :type temporary_tables: list
10612        :param annotation_fields: The `annotation_fields` parameter in the
10613        `create_transcript_view_from_columns_map` function is a list that stores the fields that are
10614        used for annotation in the query view creation process. These fields are extracted from the
10615        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
10616        :type annotation_fields: list
10617        :param column_rename: The `column_rename` parameter in the
10618        `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify
10619        custom renaming for columns during the creation of the temporary table view. This parameter
10620        provides a mapping of original column names to the desired renamed column names. By using this
10621        parameter,
10622        :type column_rename: dict
10623        :param column_clean: The `column_clean` parameter in the
10624        `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the
10625        column values should be cleaned or not. If set to `True`, the column values will be cleaned by
10626        removing any non-alphanumeric characters from them. This cleaning process ensures, defaults to
10627        False
10628        :type column_clean: bool (optional)
10629        :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map`
10630        function is used to specify the case transformation to be applied to the columns during the view
10631        creation process. It allows you to control whether the column values should be converted to
10632        lowercase, uppercase, or remain unchanged
10633        :type column_case: str
10634        :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three
10635        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
10636        """
10637
10638        log.debug("Start transcrpts view creation from columns map...")
10639
10640        # "from_columns_map": [
10641        #     {
10642        #         "transcripts_column": "Ensembl_transcriptid",
10643        #         "transcripts_infos_columns": [
10644        #             "genename",
10645        #             "Ensembl_geneid",
10646        #             "LIST_S2_score",
10647        #             "LIST_S2_pred",
10648        #         ],
10649        #     },
10650        #     {
10651        #         "transcripts_column": "Ensembl_transcriptid",
10652        #         "transcripts_infos_columns": [
10653        #             "genename",
10654        #             "VARITY_R_score",
10655        #             "Aloft_pred",
10656        #         ],
10657        #     },
10658        # ],
10659
10660        # Init
10661        if temporary_tables is None:
10662            temporary_tables = []
10663        if annotation_fields is None:
10664            annotation_fields = []
10665
10666        # Variants table
10667        table_variants = self.get_table_variants()
10668
10669        for columns_map in columns_maps:
10670
10671            # Log
10672            log.debug(f"columns_map={columns_map}")
10673
10674            # Transcript column
10675            transcripts_column = columns_map.get("transcripts_column", None)
10676
10677            # Transcripts infos columns
10678            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
10679
10680            # Transcripts infos columns rename
10681            column_rename = columns_map.get("column_rename", column_rename)
10682
10683            # Transcripts infos columns clean
10684            column_clean = columns_map.get("column_clean", column_clean)
10685
10686            # Transcripts infos columns case
10687            column_case = columns_map.get("column_case", column_case)
10688
10689            if transcripts_column is not None:
10690
10691                # Explode
10692                added_columns += self.explode_infos(
10693                    fields=[transcripts_column] + transcripts_infos_columns
10694                )
10695
10696                # View clauses
10697                clause_select_variants = []
10698                clause_select_tanscripts = []
10699                for field in [transcripts_column] + transcripts_infos_columns:
10700
10701                    # AS field
10702                    as_field = field
10703
10704                    # Rename
10705                    if column_rename:
10706                        as_field = column_rename.get(as_field, as_field)
10707
10708                    # Clean
10709                    if column_clean:
10710                        as_field = clean_annotation_field(as_field)
10711
10712                    # Case
10713                    if column_case:
10714                        if column_case.lower() in ["lower"]:
10715                            as_field = as_field.lower()
10716                        elif column_case.lower() in ["upper"]:
10717                            as_field = as_field.upper()
10718
10719                    # Clause select Variants
10720                    clause_select_variants.append(
10721                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10722                    )
10723
10724                    if field in [transcripts_column]:
10725                        clause_select_tanscripts.append(
10726                            f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10727                        )
10728                    else:
10729                        clause_select_tanscripts.append(
10730                            f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """
10731                        )
10732                        annotation_fields.append(as_field)
10733
10734                # Query View
10735                query = f""" 
10736                    SELECT
10737                        "#CHROM", POS, REF, ALT, INFO,
10738                        "{transcripts_column}" AS 'transcript',
10739                        {", ".join(clause_select_tanscripts)}
10740                    FROM (
10741                        SELECT 
10742                            "#CHROM", POS, REF, ALT, INFO,
10743                            {", ".join(clause_select_variants)}
10744                        FROM {table_variants}
10745                        )
10746                    WHERE "{transcripts_column}" IS NOT NULL
10747                """
10748
10749                # Create temporary table
10750                temporary_table = transcripts_table + "".join(
10751                    random.choices(string.ascii_uppercase + string.digits, k=10)
10752                )
10753
10754                # Temporary view
10755                temporary_tables.append(temporary_table)
10756                query_view = f"""
10757                    CREATE view {temporary_table}
10758                    AS ({query})
10759                """
10760                self.execute_query(query=query_view)
10761
10762        return added_columns, temporary_tables, annotation_fields
10763
10764    def create_transcript_view_from_column_format(
10765        self,
10766        transcripts_table: str = "transcripts",
10767        column_formats: dict = {},
10768        temporary_tables: list = None,
10769        annotation_fields: list = None,
10770        column_rename: dict = {},
10771        column_clean: bool = False,
10772        column_case: str = None,
10773    ) -> tuple[list, list, list]:
10774        """
10775        The `create_transcript_view_from_column_format` function generates a transcript view based on
10776        specified column formats, adds additional columns and annotation fields, and returns the list of
10777        temporary tables and annotation fields.
10778
10779        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10780        of the table containing the transcripts data. This table will be used as the base table for
10781        creating the transcript view. The default value for this parameter is "transcripts", but you can
10782        provide a different table name if needed, defaults to transcripts
10783        :type transcripts_table: str (optional)
10784        :param column_formats: The `column_formats` parameter is a dictionary that contains information
10785        about the columns to be used for creating the transcript view. Each entry in the dictionary
10786        specifies the mapping between a transcripts column and a transcripts infos column. This
10787        parameter allows you to define how the columns from the transcripts table should be transformed
10788        or mapped
10789        :type column_formats: dict
10790        :param temporary_tables: The `temporary_tables` parameter in the
10791        `create_transcript_view_from_column_format` function is a list that stores the names of
10792        temporary views created during the process of creating a transcript view from a column format.
10793        These temporary views are used to manipulate and extract data before generating the final
10794        transcript view
10795        :type temporary_tables: list
10796        :param annotation_fields: The `annotation_fields` parameter in the
10797        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
10798        that are extracted from the temporary views created during the process. These annotation fields
10799        are obtained by querying the temporary views and extracting the column names excluding specific
10800        columns like `#CH
10801        :type annotation_fields: list
10802        :param column_rename: The `column_rename` parameter in the
10803        `create_transcript_view_from_column_format` function is a dictionary that allows you to specify
10804        custom renaming of columns in the transcripts infos table. By providing a mapping of original
10805        column names to new column names in this dictionary, you can rename specific columns during the
10806        process
10807        :type column_rename: dict
10808        :param column_clean: The `column_clean` parameter in the
10809        `create_transcript_view_from_column_format` function is a boolean flag that determines whether
10810        the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns
10811        will be cleaned during the creation of the transcript view based on the specified column format,
10812        defaults to False
10813        :type column_clean: bool (optional)
10814        :param column_case: The `column_case` parameter in the
10815        `create_transcript_view_from_column_format` function is used to specify the case transformation
10816        to be applied to the columns in the transcript view. It can be set to either "upper" or "lower"
10817        to convert the column names to uppercase or lowercase, respectively
10818        :type column_case: str
10819        :return: The `create_transcript_view_from_column_format` function returns two lists:
10820        `temporary_tables` and `annotation_fields`.
10821        """
10822
10823        log.debug("Start transcrpts view creation from column format...")
10824
10825        #  "from_column_format": [
10826        #     {
10827        #         "transcripts_column": "ANN",
10828        #         "transcripts_infos_column": "Feature_ID",
10829        #     }
10830        # ],
10831
10832        # Init
10833        if temporary_tables is None:
10834            temporary_tables = []
10835        if annotation_fields is None:
10836            annotation_fields = []
10837
10838        added_columns = []
10839
10840        for column_format in column_formats:
10841
10842            # annotation field and transcript annotation field
10843            annotation_field = column_format.get("transcripts_column", "ANN")
10844            transcript_annotation = column_format.get(
10845                "transcripts_infos_column", "Feature_ID"
10846            )
10847
10848            # Transcripts infos columns rename
10849            column_rename = column_format.get("column_rename", column_rename)
10850
10851            # Transcripts infos columns clean
10852            column_clean = column_format.get("column_clean", column_clean)
10853
10854            # Transcripts infos columns case
10855            column_case = column_format.get("column_case", column_case)
10856
10857            # Temporary View name
10858            temporary_view_name = transcripts_table + "".join(
10859                random.choices(string.ascii_uppercase + string.digits, k=10)
10860            )
10861
10862            # Create temporary view name
10863            temporary_view_name, added_columns = self.annotation_format_to_table(
10864                annotation_field=annotation_field,
10865                view_name=temporary_view_name,
10866                annotation_id=transcript_annotation,
10867                column_rename=column_rename,
10868                column_clean=column_clean,
10869                column_case=column_case,
10870            )
10871
10872            # Annotation fields
10873            if temporary_view_name:
10874                query_annotation_fields = f"""
10875                    SELECT *
10876                    FROM (
10877                        DESCRIBE SELECT *
10878                        FROM {temporary_view_name}
10879                        )
10880                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
10881                """
10882                df_annotation_fields = self.get_query_to_df(
10883                    query=query_annotation_fields
10884                )
10885
10886                # Add temporary view and annotation fields
10887                temporary_tables.append(temporary_view_name)
10888                annotation_fields += list(set(df_annotation_fields["column_name"]))
10889
10890        return added_columns, temporary_tables, annotation_fields
10891
10892    def create_transcript_view(
10893        self,
10894        transcripts_table: str = None,
10895        transcripts_table_drop: bool = False,
10896        param: dict = {},
10897    ) -> str:
10898        """
10899        The `create_transcript_view` function generates a transcript view by processing data from a
10900        specified table based on provided parameters and structural information.
10901
10902        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
10903        is used to specify the name of the table that will store the final transcript view data. If a table
10904        name is not provided, the function will create a new table to store the transcript view data, and by
10905        default,, defaults to transcripts
10906        :type transcripts_table: str (optional)
10907        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
10908        `create_transcript_view` function is a boolean parameter that determines whether to drop the
10909        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
10910        the function will drop the existing transcripts table if it exists, defaults to False
10911        :type transcripts_table_drop: bool (optional)
10912        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
10913        contains information needed to create a transcript view. It includes details such as the structure
10914        of the transcripts, columns mapping, column formats, and other necessary information for generating
10915        the view. This parameter allows for flexibility and customization
10916        :type param: dict
10917        :return: The `create_transcript_view` function returns the name of the transcripts table that was
10918        created or modified during the execution of the function.
10919        """
10920
10921        log.debug("Start transcripts view creation...")
10922
10923        # Default
10924        transcripts_table_default = "transcripts"
10925
10926        # Param
10927        if not param:
10928            param = self.get_param()
10929
10930        # Struct
10931        struct = param.get("transcripts", {}).get("struct", None)
10932
10933        # Transcript veresion
10934        transcript_id_remove_version = param.get("transcripts", {}).get(
10935            "transcript_id_remove_version", False
10936        )
10937
10938        # Transcripts mapping
10939        transcript_id_mapping_file = param.get("transcripts", {}).get(
10940            "transcript_id_mapping_file", None
10941        )
10942
10943        # Transcripts mapping
10944        transcript_id_mapping_force = param.get("transcripts", {}).get(
10945            "transcript_id_mapping_force", None
10946        )
10947
10948        # Transcripts table
10949        if transcripts_table is None:
10950            transcripts_table = param.get("transcripts", {}).get(
10951                "table", transcripts_table_default
10952            )
10953
10954        # Check transcripts table exists
10955        if transcripts_table:
10956
10957            # Query to check if transcripts table exists
10958            query_check_table = f"""
10959                SELECT * 
10960                FROM information_schema.tables 
10961                WHERE table_name = '{transcripts_table}'
10962            """
10963            df_check_table = self.get_query_to_df(query=query_check_table)
10964
10965            # Check if transcripts table exists
10966            if len(df_check_table) > 0 and not transcripts_table_drop:
10967                log.debug(f"Table {transcripts_table} exists and not drop option")
10968                return transcripts_table
10969
10970        if struct:
10971
10972            # added_columns
10973            added_columns = []
10974
10975            # Temporary tables
10976            temporary_tables = []
10977
10978            # Annotation fields
10979            annotation_fields = []
10980
10981            # from columns map
10982            columns_maps = struct.get("from_columns_map", [])
10983            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
10984                self.create_transcript_view_from_columns_map(
10985                    transcripts_table=transcripts_table,
10986                    columns_maps=columns_maps,
10987                    added_columns=added_columns,
10988                    temporary_tables=temporary_tables,
10989                    annotation_fields=annotation_fields,
10990                )
10991            )
10992            added_columns += added_columns_tmp
10993            temporary_tables += temporary_tables_tmp
10994            annotation_fields += annotation_fields_tmp
10995
10996            # from column format
10997            column_formats = struct.get("from_column_format", [])
10998            added_columns, temporary_tables_tmp, annotation_fields_tmp = (
10999                self.create_transcript_view_from_column_format(
11000                    transcripts_table=transcripts_table,
11001                    column_formats=column_formats,
11002                    temporary_tables=temporary_tables,
11003                    annotation_fields=annotation_fields,
11004                )
11005            )
11006            added_columns += added_columns_tmp
11007            temporary_tables += temporary_tables_tmp
11008            annotation_fields += annotation_fields_tmp
11009
11010            # Remove some specific fields/column
11011            annotation_fields = list(set(annotation_fields))
11012            for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
11013                if field in annotation_fields:
11014                    annotation_fields.remove(field)
11015
11016            # Merge temporary tables query
11017            query_merge = ""
11018            for temporary_table in list(set(temporary_tables)):
11019
11020                # First temporary table
11021                if not query_merge:
11022                    query_merge = f"""
11023                        SELECT * FROM {temporary_table}
11024                    """
11025                # other temporary table (using UNION)
11026                else:
11027                    query_merge += f"""
11028                        UNION BY NAME SELECT * FROM {temporary_table}
11029                    """
11030
11031            # transcript table tmp
11032            transcript_table_tmp = "transcripts_tmp"
11033            transcript_table_tmp2 = "transcripts_tmp2"
11034            transcript_table_tmp3 = "transcripts_tmp3"
11035
11036            # Merge on transcript
11037            query_merge_on_transcripts_annotation_fields = []
11038
11039            # Add transcript list
11040            query_merge_on_transcripts_annotation_fields.append(
11041                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
11042            )
11043
11044            # Aggregate all annotations fields
11045            for annotation_field in set(annotation_fields):
11046                query_merge_on_transcripts_annotation_fields.append(
11047                    f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
11048                )
11049
11050            # Transcripts mapping
11051            if transcript_id_mapping_file:
11052
11053                # Transcript dataframe
11054                transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
11055                transcript_id_mapping_dataframe = transcripts_file_to_df(
11056                    transcript_id_mapping_file, column_names=["transcript", "alias"]
11057                )
11058
11059                # Transcript version remove
11060                if transcript_id_remove_version:
11061                    query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
11062                    query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
11063                    query_left_join = f"""
11064                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
11065                    """
11066                else:
11067                    query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
11068                    query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
11069                    query_left_join = f"""
11070                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
11071                    """
11072
11073                # Transcript column for group by merge
11074                query_transcript_merge_group_by = """
11075                        CASE
11076                            WHEN transcript_mapped NOT IN ('')
11077                            THEN split_part(transcript_mapped, '.', 1)
11078                            ELSE split_part(transcript_original, '.', 1)
11079                        END
11080                    """
11081
11082                # Merge query
11083                transcripts_tmp2_query = f"""
11084                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
11085                    FROM ({query_merge}) AS {transcript_table_tmp}
11086                    {query_left_join}
11087                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
11088                """
11089
11090                # Retrive columns after mege
11091                transcripts_tmp2_describe_query = f"""
11092                    DESCRIBE {transcripts_tmp2_query}
11093                """
11094                transcripts_tmp2_describe_list = list(
11095                    self.get_query_to_df(query=transcripts_tmp2_describe_query)[
11096                        "column_name"
11097                    ]
11098                )
11099
11100                # Create list of columns for select clause
11101                transcripts_tmp2_describe_select_clause = []
11102                for field in transcripts_tmp2_describe_list:
11103                    if field not in [
11104                        "#CHROM",
11105                        "POS",
11106                        "REF",
11107                        "ALT",
11108                        "INFO",
11109                        "transcript_mapped",
11110                    ]:
11111                        as_field = field
11112                        if field in ["transcript_original"]:
11113                            as_field = "transcripts_mapped"
11114                        transcripts_tmp2_describe_select_clause.append(
11115                            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
11116                        )
11117
11118                # Merge with mapping
11119                query_merge_on_transcripts = f"""
11120                    SELECT
11121                        "#CHROM", POS, REF, ALT, INFO,
11122                        CASE
11123                            WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
11124                            THEN ANY_VALUE(transcript_mapped)
11125                            ELSE ANY_VALUE(transcript_original)
11126                        END AS transcript,
11127                        {", ".join(transcripts_tmp2_describe_select_clause)}
11128                    FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
11129                    GROUP BY "#CHROM", POS, REF, ALT, INFO,
11130                        {query_transcript_merge_group_by}
11131                """
11132
11133                # Add transcript filter from mapping file
11134                if transcript_id_mapping_force:
11135                    query_merge_on_transcripts = f"""
11136                        SELECT *
11137                        FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
11138                        WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
11139                    """
11140
11141            # No transcript mapping
11142            else:
11143
11144                # Remove transcript version
11145                if transcript_id_remove_version:
11146                    query_transcript_column = f"""
11147                        split_part({transcript_table_tmp}.transcript, '.', 1)
11148                    """
11149                else:
11150                    query_transcript_column = """
11151                        transcript
11152                    """
11153
11154                # Query sections
11155                query_transcript_column_select = (
11156                    f"{query_transcript_column} AS transcript"
11157                )
11158                query_transcript_column_group_by = query_transcript_column
11159
11160                # Query for transcripts view
11161                query_merge_on_transcripts = f"""
11162                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
11163                    FROM ({query_merge}) AS {transcript_table_tmp}
11164                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
11165                """
11166
11167            # Drop transcript view is necessary
11168            if transcripts_table_drop:
11169                query_drop = f"""
11170                    DROP TABLE IF EXISTS {transcripts_table};
11171                """
11172                self.execute_query(query=query_drop)
11173
11174            # List of unique #CHROM
11175            query_unique_chrom = f"""
11176                SELECT DISTINCT "#CHROM"
11177                FROM variants AS subquery
11178            """
11179            unique_chroms = self.get_query_to_df(query=query_unique_chrom)
11180
11181            # Create table with structure but without data, if not exists
11182            query_create_table = f"""
11183                CREATE TABLE IF NOT EXISTS {transcripts_table} AS
11184                SELECT * FROM ({query_merge_on_transcripts}) AS subquery LIMIT 0
11185            """
11186            self.execute_query(query=query_create_table)
11187
11188            # Process by #CHROM
11189            for chrom in unique_chroms["#CHROM"]:
11190
11191                # Log
11192                log.debug(f"Processing #CHROM={chrom}")
11193
11194                # Select data by #CHROM
11195                query_chunk = f"""
11196                    SELECT *
11197                    FROM ({query_merge_on_transcripts})
11198                    WHERE "#CHROM" = '{chrom}'
11199                """
11200
11201                # Insert data
11202                query_insert_chunk = f"""
11203                    INSERT INTO {transcripts_table}
11204                    {query_chunk}
11205                """
11206                self.execute_query(query=query_insert_chunk)
11207
11208            # Remove temporary tables
11209            if temporary_tables:
11210                for temporary_table in list(set(temporary_tables)):
11211                    try:
11212                        query_drop_tmp_table = f"""
11213                            DROP TABLE IF EXISTS {temporary_table}
11214                        """
11215                        self.execute_query(query=query_drop_tmp_table)
11216                    except Exception as e:
11217                        log.debug(f"'{temporary_table}' Not a table")
11218                    try:
11219                        query_drop_tmp_table = f"""
11220                            DROP VIEW IF EXISTS {temporary_table}
11221                        """
11222                        self.execute_query(query=query_drop_tmp_table)
11223                    except Exception as e:
11224                        log.debug(f"'{temporary_table}' Not a view")
11225
11226            # Remove added columns
11227            for added_column in added_columns:
11228                self.drop_column(column=added_column)
11229
11230        else:
11231
11232            transcripts_table = None
11233
11234        return transcripts_table
11235
    def annotation_format_to_table(
        self,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
        column_rename: dict = {},
        column_clean: bool = False,
        column_case: str = None,
    ) -> tuple:
        """
        Convert a snpEff/VEP-style annotation INFO field (e.g. "ANN") into a
        structured view with one row per (variant, annotation entry).

        The annotation sub-field names are parsed from the single-quoted part
        of the field's header description (sub-fields separated by " | "). A
        sample of exploded annotations is used to detect each sub-field's
        column type, then a view is created that splits every annotation entry
        into typed columns and exposes `annotation_id` as the "transcript"
        column.

        :param annotation_field: INFO field containing the annotations
            (pipe-separated sub-fields, comma-separated entries per variant),
            defaults to "ANN"
        :type annotation_field: str (optional)
        :param annotation_id: Annotation sub-field used as the transcript
            identifier ("transcript" column) of the resulting view, defaults
            to "Feature_ID"
        :type annotation_id: str (optional)
        :param view_name: Name of the view to create, defaults to "transcripts"
        :type view_name: str (optional)
        :param column_rename: Optional mapping to rename annotation sub-field
            columns; also applied to `annotation_id`
        :type column_rename: dict
        :param column_clean: If True, clean sub-field names (and
            `annotation_id`) via `clean_annotation_field`, defaults to False
        :type column_clean: bool (optional)
        :param column_case: Optional case transformation for column names:
            "lower" or "upper"
        :type column_case: str
        :return: A tuple (view_name, added_columns): `view_name` is the name of
            the created view, or None if `annotation_field` is absent from the
            VCF header; `added_columns` lists the columns added to the variants
            table by exploding the annotation field.
        """

        # Apply rename/clean to the transcript identifier sub-field name,
        # mirroring the transformations applied to the detected columns below
        if column_rename:
            annotation_id = column_rename.get(annotation_id, annotation_id)

        if column_clean:
            annotation_id = clean_annotation_field(annotation_id)

        # Prefix for the generated column names.
        # NOTE(review): any non-empty configured prefix is overwritten with
        # "INFO/" here — confirm this is intended rather than using the
        # configured prefix value as-is.
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header (provides the annotation field description to parse)
        vcf_reader = self.get_header()

        # Columns added to the variants table (returned so the caller can drop them)
        added_columns = []

        # Explode the annotation INFO field into a column of the variants table
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract annotation sub-field names from the header description:
            # the quoted segment lists sub-fields separated by " | "
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Keep only alphanumeric characters for the column name,
                    # but remember the original sub-field label
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Sample the exploded annotations (LIMIT 1000) to detect the
            # column type of each sub-field.
            # NOTE(review): the column "ANN" is hard-coded in this query; if
            # annotation_field differs from "ANN" (or the exploded column is
            # prefixed), this likely targets the wrong column — confirm.
            dataframe_annotation_format = self.get_query_to_df(
                f""" 
                WITH exploded_annotations AS (
                    SELECT
                        UNNEST(STRING_SPLIT(ANN, ',')) AS annotation
                    FROM {table_variants}
                ),
                split_annotations AS (
                    SELECT
                        {", ".join([f"SPLIT_PART(annotation, '|', {i+1}) AS '{header}'" for i, header in enumerate(ann_header_desc.values())])},
                    FROM exploded_annotations
                )
                SELECT * FROM split_annotations
                LIMIT 1000
                """
            )

            # Build one typed SELECT clause per detected sub-field column
            query_list_keys = []
            key_i = 0

            for key in dataframe_annotation_format.keys():

                # SPLIT_PART positions are 1-based
                key_i += 1
                key_clean = key

                # key rename
                if column_rename:
                    key_clean = column_rename.get(key_clean, key_clean)

                # key clean
                if column_clean:
                    key_clean = clean_annotation_field(key_clean)

                # Key case
                if column_case:
                    if column_case.lower() in ["lower"]:
                        key_clean = key_clean.lower()
                    elif column_case.lower() in ["upper"]:
                        key_clean = key_clean.upper()

                # Detect column type from the sampled values
                column_type = detect_column_type(dataframe_annotation_format[key])

                # Append key to list; empty strings become NULL before the cast
                query_list_keys.append(
                    f""" NULLIF(SPLIT_PART(annotation, '|', {key_i}), '')::{column_type} AS '{prefix}{key_clean}' """
                )

            # Create the view: one row per annotation entry, variant keys kept,
            # annotation_id exposed as 'transcript'.
            # NOTE(review): "ANN" is hard-coded here as well (see note above).
            query_create_view = f"""
                CREATE VIEW {view_name} AS (
                    WITH exploded_annotations AS (
                        SELECT
                            "#CHROM",
                            POS,
                            REF,
                            ALT,
                            INFO,
                            UNNEST(STRING_SPLIT(ANN, ',')) AS annotation
                        FROM {table_variants}
                    ),
                    split_annotations AS (
                        SELECT
                            "#CHROM",
                            POS,
                            REF,
                            ALT,
                            INFO,
                            {", ".join(query_list_keys)},
                        FROM exploded_annotations
                    )
                    SELECT *, {annotation_id} AS 'transcript' FROM split_annotations
                )
            """
            log.debug(f"query_create_view: {query_create_view}")
            self.execute_query(query=query_create_view)

        else:

            # Annotation field not in header: no view created
            view_name = None

        return view_name, added_columns
11420
11421    def transcript_view_to_variants(
11422        self,
11423        transcripts_table: str = None,
11424        transcripts_column_id: str = None,
11425        transcripts_info_json: str = None,
11426        transcripts_info_field_json: str = None,
11427        transcripts_info_format: str = None,
11428        transcripts_info_field_format: str = None,
11429        param: dict = {},
11430    ) -> bool:
11431        """
11432        The `transcript_view_to_variants` function updates a variants table with information from
11433        transcripts in JSON format.
11434
11435        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the
11436        table containing the transcripts data. If this parameter is not provided, the function will
11437        attempt to retrieve it from the `param` dictionary or use a default value of "transcripts"
11438        :type transcripts_table: str
11439        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the
11440        column in the `transcripts_table` that contains the unique identifier for each transcript. This
11441        identifier is used to match transcripts with variants in the database
11442        :type transcripts_column_id: str
11443        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name
11444        of the column in the variants table where the transcripts information will be stored in JSON
11445        format. This parameter allows you to define the column in the variants table that will hold the
11446        JSON-formatted information about transcripts
11447        :type transcripts_info_json: str
11448        :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to
11449        specify the field in the VCF header that will contain information about transcripts in JSON
11450        format. This field will be added to the VCF header as an INFO field with the specified name
11451        :type transcripts_info_field_json: str
11452        :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the
11453        format of the information about transcripts that will be stored in the variants table. This
11454        format can be used to define how the transcript information will be structured or displayed
11455        within the variants table
11456        :type transcripts_info_format: str
11457        :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to
11458        specify the field in the VCF header that will contain information about transcripts in a
11459        specific format. This field will be added to the VCF header as an INFO field with the specified
11460        name
11461        :type transcripts_info_field_format: str
11462        :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary
11463        that contains various configuration settings related to transcripts. It is used to provide
11464        default values for certain parameters if they are not explicitly provided when calling the
11465        method. The `param` dictionary can be passed as an argument
11466        :type param: dict
11467        :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True`
11468        if the operation is successful and `False` if certain conditions are not met.
11469        """
11470
11471        msg_info_prefix = "Start transcripts view to variants annotations"
11472
11473        log.debug(f"{msg_info_prefix}...")
11474
11475        # Default
11476        transcripts_table_default = "transcripts"
11477        transcripts_column_id_default = "transcript"
11478        transcripts_info_json_default = None
11479        transcripts_info_format_default = None
11480        transcripts_info_field_json_default = None
11481        transcripts_info_field_format_default = None
11482
11483        # Param
11484        if not param:
11485            param = self.get_param()
11486
11487        # Transcripts table
11488        if transcripts_table is None:
11489            transcripts_table = param.get("transcripts", {}).get(
11490                "table", transcripts_table_default
11491            )
11492
11493        # Transcripts column ID
11494        if transcripts_column_id is None:
11495            transcripts_column_id = param.get("transcripts", {}).get(
11496                "column_id", transcripts_column_id_default
11497            )
11498
11499        # Transcripts info json
11500        if transcripts_info_json is None:
11501            transcripts_info_json = param.get("transcripts", {}).get(
11502                "transcripts_info_json", transcripts_info_json_default
11503            )
11504
11505        # Transcripts info field JSON
11506        if transcripts_info_field_json is None:
11507            transcripts_info_field_json = param.get("transcripts", {}).get(
11508                "transcripts_info_field_json", transcripts_info_field_json_default
11509            )
11510        # if transcripts_info_field_json is not None and transcripts_info_json is None:
11511        #     transcripts_info_json = transcripts_info_field_json
11512
11513        # Transcripts info format
11514        if transcripts_info_format is None:
11515            transcripts_info_format = param.get("transcripts", {}).get(
11516                "transcripts_info_format", transcripts_info_format_default
11517            )
11518
11519        # Transcripts info field FORMAT
11520        if transcripts_info_field_format is None:
11521            transcripts_info_field_format = param.get("transcripts", {}).get(
11522                "transcripts_info_field_format", transcripts_info_field_format_default
11523            )
11524        # if (
11525        #     transcripts_info_field_format is not None
11526        #     and transcripts_info_format is None
11527        # ):
11528        #     transcripts_info_format = transcripts_info_field_format
11529
11530        # Variants table
11531        table_variants = self.get_table_variants()
11532
11533        # Check info columns param
11534        if (
11535            transcripts_info_json is None
11536            and transcripts_info_field_json is None
11537            and transcripts_info_format is None
11538            and transcripts_info_field_format is None
11539        ):
11540            return False
11541
11542        # Transcripts infos columns
11543        query_transcripts_infos_columns = f"""
11544            SELECT *
11545            FROM (
11546                DESCRIBE SELECT * FROM {transcripts_table}
11547                )
11548            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
11549        """
11550        transcripts_infos_columns = list(
11551            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
11552        )
11553
11554        # View results
11555        clause_select = []
11556        clause_to_json = []
11557        clause_to_format = []
11558        for field in transcripts_infos_columns:
11559            # Do not consider INFO field for export into fields
11560            if field not in ["INFO"]:
11561                clause_select.append(
11562                    f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """
11563                )
11564                clause_to_json.append(f""" '{field}': "{field}" """)
11565                clause_to_format.append(f""" "{field}" """)
11566
11567        # Update
11568        update_set_json = []
11569        update_set_format = []
11570
11571        # VCF header
11572        vcf_reader = self.get_header()
11573
11574        # Transcripts to info column in JSON
11575        if transcripts_info_json:
11576
11577            # Create column on variants table
11578            self.add_column(
11579                table_name=table_variants,
11580                column_name=transcripts_info_json,
11581                column_type="JSON",
11582                default_value=None,
11583                drop=False,
11584            )
11585
11586            # Add header
11587            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
11588                transcripts_info_json,
11589                ".",
11590                "String",
11591                "Transcripts in JSON format",
11592                "unknwon",
11593                "unknwon",
11594                self.code_type_map["String"],
11595            )
11596
11597            # Add to update
11598            update_set_json.append(
11599                f""" {transcripts_info_json}=t.{transcripts_info_json} """
11600            )
11601
11602        # Transcripts to info field in JSON
11603        if transcripts_info_field_json:
11604
11605            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")
11606
11607            # Add to update
11608            update_set_json.append(
11609                f""" 
11610                    INFO = concat(
11611                            CASE
11612                                WHEN INFO NOT IN ('', '.')
11613                                THEN INFO
11614                                ELSE ''
11615                            END,
11616                            CASE
11617                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
11618                                THEN concat(
11619                                    ';{transcripts_info_field_json}=',
11620                                    t.{transcripts_info_json}
11621                                )
11622                                ELSE ''
11623                            END
11624                            )
11625                """
11626            )
11627
11628            # Add header
11629            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
11630                transcripts_info_field_json,
11631                ".",
11632                "String",
11633                "Transcripts in JSON format",
11634                "unknwon",
11635                "unknwon",
11636                self.code_type_map["String"],
11637            )
11638
11639        if update_set_json:
11640
11641            # Update query
11642            query_update = f"""
11643                UPDATE {table_variants}
11644                    SET {", ".join(update_set_json)}
11645                FROM
11646                (
11647                    SELECT
11648                        "#CHROM", POS, REF, ALT,
11649                            concat(
11650                            '{{',
11651                            string_agg(
11652                                '"' || "{transcripts_column_id}" || '":' ||
11653                                to_json(json_output)
11654                            ),
11655                            '}}'
11656                            )::JSON AS {transcripts_info_json}
11657                    FROM
11658                        (
11659                        SELECT
11660                            "#CHROM", POS, REF, ALT,
11661                            "{transcripts_column_id}",
11662                            to_json(
11663                                {{{",".join(clause_to_json)}}}
11664                            )::JSON AS json_output
11665                        FROM
11666                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11667                        WHERE "{transcripts_column_id}" IS NOT NULL
11668                        )
11669                    GROUP BY "#CHROM", POS, REF, ALT
11670                ) AS t
11671                WHERE {table_variants}."#CHROM" = t."#CHROM"
11672                    AND {table_variants}."POS" = t."POS"
11673                    AND {table_variants}."REF" = t."REF"
11674                    AND {table_variants}."ALT" = t."ALT"
11675            """
11676
11677            self.execute_query(query=query_update)
11678
11679        # Transcripts to info column in FORMAT
11680        if transcripts_info_format:
11681
11682            # Create column on variants table
11683            self.add_column(
11684                table_name=table_variants,
11685                column_name=transcripts_info_format,
11686                column_type="VARCHAR",
11687                default_value=None,
11688                drop=False,
11689            )
11690
11691            # Add header
11692            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
11693                transcripts_info_format,
11694                ".",
11695                "String",
11696                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11697                "unknwon",
11698                "unknwon",
11699                self.code_type_map["String"],
11700            )
11701
11702            # Add to update
11703            update_set_format.append(
11704                f""" {transcripts_info_format}=t.{transcripts_info_format} """
11705            )
11706
11707        else:
11708
11709            # Set variable for internal queries
11710            transcripts_info_format = "transcripts_info_format"
11711
11712        # Transcripts to info field in JSON
11713        if transcripts_info_field_format:
11714
11715            log.debug(f"{msg_info_prefix} - Annotation in structured format...")
11716
11717            # Add to update
11718            update_set_format.append(
11719                f""" 
11720                    INFO = concat(
11721                            CASE
11722                                WHEN INFO NOT IN ('', '.')
11723                                THEN INFO
11724                                ELSE ''
11725                            END,
11726                            CASE
11727                                WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
11728                                THEN concat(
11729                                    ';{transcripts_info_field_format}=',
11730                                    t.{transcripts_info_format}
11731                                )
11732                                ELSE ''
11733                            END
11734                            )
11735                """
11736            )
11737
11738            # Add header
11739            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
11740                transcripts_info_field_format,
11741                ".",
11742                "String",
11743                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11744                "unknwon",
11745                "unknwon",
11746                self.code_type_map["String"],
11747            )
11748
11749        if update_set_format:
11750
11751            # Update query
11752            query_update = f"""
11753                UPDATE {table_variants}
11754                    SET {", ".join(update_set_format)}
11755                FROM
11756                (
11757                    SELECT
11758                        "#CHROM", POS, REF, ALT,
11759                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
11760                    FROM 
11761                        (
11762                        SELECT
11763                            "#CHROM", POS, REF, ALT,
11764                            "{transcripts_column_id}",
11765                            concat(
11766                                "{transcripts_column_id}",
11767                                '|',
11768                                {", '|', ".join(clause_to_format)}
11769                            ) AS {transcripts_info_format}
11770                        FROM
11771                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11772                        )
11773                    GROUP BY "#CHROM", POS, REF, ALT
11774                ) AS t
11775                WHERE {table_variants}."#CHROM" = t."#CHROM"
11776                    AND {table_variants}."POS" = t."POS"
11777                    AND {table_variants}."REF" = t."REF"
11778                    AND {table_variants}."ALT" = t."ALT"
11779            """
11780
11781            self.execute_query(query=query_update)
11782
11783        return True
11784
    def rename_info_fields(
        self, fields_to_rename: dict = None, table: str = None
    ) -> dict:
        """
        The `rename_info_fields` function renames (or removes) specified fields in the
        VCF header and rewrites the corresponding `<FIELD>=<value>` entries of the INFO
        column in the variants table.

        Nothing is done when the database access mode is read-only ('RO') or when no
        mapping is provided. The INFO rewriting is performed with nested SQL
        `regexp_replace` calls; to keep each SQL expression at a manageable regexp
        complexity, replacements are partitioned in groups of `regex_replace_partition`
        fields, each group being applied with its own UPDATE query.

        :param fields_to_rename: dictionary mapping original field names (keys) to
        their new names (values); a value of None removes the field instead of
        renaming it
        :type fields_to_rename: dict
        :param table: name of the variants table to update; defaults to the main
        variants table when None
        :type table: str
        :return: dictionary of the fields actually renamed or removed, mapping the
        original name to the new name (or None when the field was removed)
        """

        # Init
        fields_renamed = {}
        config = self.get_config()
        access = config.get("access")

        # Default to the main variants table
        if table is None:
            table = self.get_table_variants()

        # regexp replace function state:
        # - regex_replace_dict maps a partition key to a nested regexp_replace expression
        # - a fresh expression is started every `regex_replace_partition` fields so a
        #   single SQL regexp chain does not grow unboundedly
        regex_replace_dict = {}
        regex_replace_nb = 0
        regex_replace_partition = 125
        regex_replace = "concat(INFO, ';')"  # Add a trailing ';' to reduce regexp complexity

        if fields_to_rename is not None and access not in ["RO"]:

            log.info("Rename or remove fields...")

            # Header
            header = self.get_header()

            for field_to_rename, field_renamed in fields_to_rename.items():

                if field_to_rename in header.infos:

                    # Rename header: copy the old field's metadata under the new name
                    # (skipped when removing), then drop the old entry
                    if field_renamed is not None:
                        header.infos[field_renamed] = vcf.parser._Info(
                            field_renamed,
                            header.infos[field_to_rename].num,
                            header.infos[field_to_rename].type,
                            header.infos[field_to_rename].desc,
                            header.infos[field_to_rename].source,
                            header.infos[field_to_rename].version,
                            header.infos[field_to_rename].type_code,
                        )
                    del header.infos[field_to_rename]

                    # Rename INFO patterns: match '<FIELD>' or '<FIELD>=<value>' up to
                    # the next ';'; the replacement keeps the leading separator (\1)
                    # and the value (\3) when renaming, or drops the entry when removing
                    # NOTE(review): field names are not regex-escaped — assumes names
                    # contain no regex metacharacters (true for usual INFO keys)
                    field_pattern = rf"(^|;)({field_to_rename})(=[^;]*)?;"
                    if field_renamed is not None:
                        field_renamed_pattern = rf"\1{field_renamed}\3;"
                    else:
                        field_renamed_pattern = r"\1"

                    # regexp replace: nest this replacement into the current expression,
                    # restarting from a fresh expression at each partition boundary
                    regex_replace_nb += 1
                    regex_replace_key = math.floor(
                        regex_replace_nb / regex_replace_partition
                    )
                    if (regex_replace_nb % regex_replace_partition) == 0:
                        regex_replace = "concat(INFO, ';')"
                    regex_replace = f"regexp_replace({regex_replace}, '{field_pattern}', '{field_renamed_pattern}')"
                    regex_replace_dict[regex_replace_key] = regex_replace

                    # Return
                    fields_renamed[field_to_rename] = field_renamed

                    # Log
                    if field_renamed is not None:
                        log.info(
                            f"Rename or remove fields - field '{field_to_rename}' renamed to '{field_renamed}'"
                        )
                    else:
                        log.info(
                            f"Rename or remove fields - field '{field_to_rename}' removed"
                        )

                else:

                    log.warning(
                        f"Rename or remove fields - field '{field_to_rename}' not in header"
                    )

            # Rename INFO: run one UPDATE per partition, stripping the trailing ';'
            # that was appended to simplify the patterns
            for regex_replace_key, regex_replace in regex_replace_dict.items():
                log.info(
                    f"Rename or remove fields - Process [{regex_replace_key+1}/{len(regex_replace_dict)}]..."
                )
                query = f"""
                    UPDATE {table}
                    SET
                        INFO = regexp_replace({regex_replace}, ';$', '')
                """
                log.debug(f"query={query}")
                self.execute_query(query=query)

        return fields_renamed
11896
11897    def calculation_rename_info_fields(
11898        self,
11899        fields_to_rename: dict = None,
11900        table: str = None,
11901        operation_name: str = "RENAME_INFO_FIELDS",
11902    ) -> None:
11903        """
11904        The `calculation_rename_info_fields` function retrieves parameters from a dictionary, updates
11905        fields to rename and table if provided, and then calls another function to rename the fields.
11906
11907        :param fields_to_rename: `fields_to_rename` is a dictionary that contains the fields to be
11908        renamed in a table. Each key-value pair in the dictionary represents the original field name as
11909        the key and the new field name as the value
11910        :type fields_to_rename: dict
11911        :param table: The `table` parameter in the `calculation_rename_info_fields` method is used to
11912        specify the name of the table for which the fields are to be renamed. It is a string type
11913        parameter
11914        :type table: str
11915        :param operation_name: The `operation_name` parameter in the `calculation_rename_info_fields`
11916        method is a string that specifies the name of the operation being performed. In this context, it
11917        is used as a default value for the operation name if not explicitly provided when calling the
11918        function, defaults to RENAME_INFO_FIELDS
11919        :type operation_name: str (optional)
11920        """
11921
11922        # Param
11923        param = self.get_param()
11924
11925        # Get param fields to rename
11926        param_fields_to_rename = (
11927            param.get("calculation", {})
11928            .get("calculations", {})
11929            .get(operation_name, {})
11930            .get("fields_to_rename", None)
11931        )
11932
11933        # Get param table
11934        param_table = (
11935            param.get("calculation", {})
11936            .get("calculations", {})
11937            .get(operation_name, {})
11938            .get("table", None)
11939        )
11940
11941        # Init fields_to_rename
11942        if fields_to_rename is None:
11943            fields_to_rename = param_fields_to_rename
11944
11945        # Init table
11946        if table is None:
11947            table = param_table
11948
11949        renamed_fields = self.rename_info_fields(
11950            fields_to_rename=fields_to_rename, table=table
11951        )
11952
11953        log.debug(f"renamed_fields:{renamed_fields}")
11954
11955    def create_annotations_view(
11956        self,
11957        table: str = None,
11958        view: str = None,
11959        view_type: str = None,
11960        fields: list = None,
11961        prefix: str = "",
11962        drop_view: bool = False,
11963        fields_to_rename: dict = None,
11964        limit: int = None,
11965    ) -> str:
11966        """
11967        The `create_annotations_view` function creates a SQL view from fields in a VCF INFO column.
11968
11969        :param table: The `table` parameter in the `create_annotations_view` function is used to specify
11970        the name of the table from which the fields are to be extracted. This table contains the
11971        variants data, and the function creates a view based on the fields in the INFO column of this
11972        table
11973        :type table: str
11974        :param view: The `view` parameter in the `create_annotations_view` function is used to specify
11975        the name of the view that will be created based on the fields in the VCF INFO column. This view
11976        will contain the extracted fields from the INFO column in a structured format for further
11977        processing or analysis
11978        :type view: str
11979        :param view_type: The `view_type` parameter in the `create_annotations_view` function is used to
11980        specify the type of view that will be created. It can be either a `VIEW` or a `TABLE`, and the
11981        function will create the view based on the specified type
11982        :type view_type: str
11983        :param fields: The `fields` parameter in the `create_annotations_view` function is a list that
11984        contains the names of the fields to be extracted from the INFO column in the VCF file. These
11985        fields will be used to create the view with the specified columns and data extracted from the
11986        INFO column
11987        :type fields: list
11988        :param prefix: The `prefix` parameter in the `create_annotations_view` function is used to
11989        specify a prefix that will be added to the field names in the view. This prefix helps in
11990        distinguishing the fields extracted from the INFO column in the view
11991        :type prefix: str
11992        :param drop_view: The `drop_view` parameter in the `create_annotations_view` function is a boolean
11993        flag that determines whether to drop the existing view with the same name before creating a new
11994        view. If set to `True`, the function will drop the existing view before creating a new view with
11995        the specified name
11996        :type drop_view: bool
11997        :param fields_to_rename: The `fields_to_rename` parameter in the `create_annotations_view`
11998        function is a dictionary that contains the mapping of fields to be renamed in the VCF file. The
11999        keys in the dictionary represent the original field names that need to be renamed, and the
12000        corresponding values represent the new names to which the fields should be
12001        :type fields_to_rename: dict
12002        :param limit: The `limit` parameter in the `create_annotations_view` function is an integer that
12003        specifies the maximum number of rows to be included in the view. If provided, the function will
12004        limit the number of rows in the view to the specified value
12005        :type limit: int
12006        :return: The `create_annotations_view` function returns the name of the view that is created
12007        based on the fields extracted from the INFO column in the VCF file. This view contains the
12008        extracted fields in a structured format for further processing or analysis
12009        """
12010
12011        # Create a sql view from fields in VCF INFO column, with each column is a field present in the VCF header (with a specific type from VCF header) and extracted from INFO column (with a regexp like in rename_info_fields), and each row is a variant.
12012
12013        # Get table
12014        if table is None:
12015            table = self.get_table_variants()
12016
12017        # Get view
12018        if view is None:
12019            view = f"{table}_annotations"
12020
12021        # Get view type
12022        if view_type is None:
12023            view_type = "VIEW"
12024
12025        # Check view type value
12026        if view_type.upper() not in ["VIEW", "TABLE"]:
12027            raise ValueError(
12028                f"Invalid view type value: {view_type}. Either 'VIEW' or 'TABLE'"
12029            )
12030
12031        # Get header
12032        header = self.get_header()
12033
12034        # Get fields
12035        if fields is None:
12036            fields = list(header.infos.keys())
12037
12038        # Get fields to rename
12039        if fields_to_rename is None:
12040            fields_to_rename = {}
12041
12042        log.info(
12043            f"Create '{view}' view (as '{view_type}') from table '{table}' with {len(fields)} fields"
12044        )
12045
12046        # Describe table
12047        table_describe_query = f"""
12048            DESCRIBE {table}
12049        """
12050        table_describe = self.get_query_to_df(query=table_describe_query)
12051
12052        # Create fields for annotation view extracted from INFO column in table variants (with regexp_replace like in rename_info_fields), with column type from VCF header
12053        fields_columns = []
12054        fields_needed = ["#CHROM", "POS", "REF", "ALT"]
12055        field_sql_type_list = False
12056        for field in fields:
12057
12058            # Rename field
12059            field_to_rename = fields_to_rename.get(field, field)
12060
12061            # Check field type
12062
12063            # Needed fields
12064            if field in fields_needed:
12065                continue
12066
12067            # Fields in table
12068            elif field in list(table_describe.get("column_name")):
12069                fields_columns.append(f""" "{field}" AS '{prefix}{field_to_rename}' """)
12070
12071            # Fields in header
12072            elif field in header.infos:
12073
12074                # Field info
12075                field_infos = header.infos.get(field, None)
12076
12077                # Field SQL type
12078                field_sql_type = code_type_map_to_sql.get(field_infos.type, "VARCHAR")
12079
12080                # Column is a list
12081                if field_infos.num != 1:
12082                    field_sql_type_list = True
12083
12084                # Colonne is a flag
12085                if field_infos.type == "Flag":
12086                    field_pattern = rf"(^|;)({field})([^;]*)?"
12087                    fields_columns.append(
12088                        f""" regexp_matches("INFO", '{field_pattern}')::BOOLEAN AS '{prefix}{field_to_rename}' """
12089                    )
12090
12091                # Colonne with a type
12092                else:
12093
12094                    # Field pattern
12095                    field_pattern = rf"(^|;)({field})=([^;]*)?"
12096
12097                    # Field is a list
12098                    if field_sql_type_list:
12099                        fields_columns.append(
12100                            f""" CAST(list_transform(string_split(NULLIF(regexp_extract("INFO", '{field_pattern}', 3), ''), ','), x -> CASE WHEN x = '.' OR x = '' THEN NULL ELSE x END) AS {field_sql_type}[]) AS '{prefix}{field_to_rename}' """
12101                        )
12102
12103                    # Field is a unique value
12104                    else:
12105                        fields_columns.append(
12106                            f""" NULLIF(regexp_replace(regexp_extract("INFO", '{field_pattern}', 3), '^\\.$', ''), '')::{field_sql_type} AS '{prefix}{field_to_rename}' """
12107                        )
12108
12109            else:
12110                fields_columns.append(f""" null AS '{prefix}{field_to_rename}' """)
12111                msg_err = f"Field '{field}' is not found (in table or header): '{field}' will be set to NULL"
12112                log.warning(msg=msg_err)
12113
12114        # Limit
12115        limit_clause = ""
12116        if limit is not None:
12117            limit_clause = f" LIMIT {limit} "
12118
12119        # Query select
12120        query_select = f"""
12121            SELECT
12122                {', '.join([f'"{field}"' for field in fields_needed])}, {", ".join(fields_columns)}
12123            FROM
12124                {table}
12125            {limit_clause}
12126        """
12127
12128        # Drop if any
12129        if drop_view:
12130            log.debug(f"Drop view: {view}")
12131            query_create_view = f"""
12132                DROP {view_type} IF EXISTS {view}
12133            """
12134            self.execute_query(query=query_create_view)
12135            log.debug(f"View dropped: {view}")
12136
12137        # Create view
12138        log.debug(f"Create view: {view}")
12139        query_create_view = f"""
12140            CREATE {view_type} IF NOT EXISTS {view} AS {query_select}
12141        """
12142        # log.debug(f"query_create_view:{query_create_view}")
12143        self.execute_query(query=query_create_view)
12144        log.debug(f"View created: {view}")
12145
12146        return view
class Variants:
   37class Variants:
   38
    def __init__(
        self,
        conn=None,
        input: str = None,
        output: str = None,
        config: dict = {},
        param: dict = {},
        load: bool = False,
    ) -> None:
        """
        Initialize a Variants object: internal variables, input/output files,
        configuration, parameters, database connection, VCF header and samples,
        and optionally load the data.

        The setup calls below are order-dependent (e.g. the connection relies on
        config/param, the header on the connection and input).

        :param conn: the connection to the database; presumably a new one is
        created downstream when None — confirm in `set_connexion`
        :param input: the input file (path string or file-like object)
        :param output: the output file path
        :param config: a dictionary containing the configuration of the object
        (NOTE(review): mutable default shared across calls — callers should pass
        their own dict)
        :param param: a dictionary containing the parameters of the object
        (same mutable-default caveat as `config`)
        :param load: when True, load the input data immediately after setup
        """

        # Init variables
        self.init_variables()

        # Input
        self.set_input(input)

        # Config
        self.set_config(config)

        # Param
        self.set_param(param)

        # Output
        self.set_output(output)

        # connexion
        self.set_connexion(conn)

        # Header
        self.set_header()

        # Samples
        self.set_samples()

        # Load data
        if load:
            self.load_data()
   86
   87    def set_samples(self, samples: list = None) -> list:
   88        """
   89        The function `set_samples` sets the samples attribute of an object to a provided list or
   90        retrieves it from a parameter dictionary.
   91
   92        :param samples: The `set_samples` method is a method of a class that takes a list of samples as
   93        input and sets the `samples` attribute of the class to the provided list. If no samples are
   94        provided, it tries to get the samples from the class's parameters using the `get_param` method
   95        :type samples: list
   96        :return: The `samples` list is being returned.
   97        """
   98
   99        if not samples:
  100            samples = self.get_param().get("samples", {}).get("list", None)
  101
  102        self.samples = samples
  103
  104        return samples
  105
  106    def get_samples(self) -> list:
  107        """
  108        This function returns a list of samples.
  109        :return: The `get_samples` method is returning the `samples` attribute of the object.
  110        """
  111
  112        return self.samples
  113
  114    def get_samples_check(self) -> bool:
  115        """
  116        This function returns the value of the "check" key within the "samples" dictionary retrieved
  117        from the parameters.
  118        :return: The method `get_samples_check` is returning the value of the key "check" inside the
  119        "samples" dictionary, which is nested inside the dictionary returned by the `get_param()`
  120        method. If the key "check" is not found, it will return `False`.
  121        """
  122
  123        return self.get_param().get("samples", {}).get("check", True)
  124
  125    def set_input(self, input: str = None) -> None:
  126        """
  127        The function `set_input` takes a file name as input, extracts the name and extension, and sets
  128        attributes in the class accordingly.
  129
  130        :param input: The `set_input` method in the provided code snippet is used to set attributes
  131        related to the input file. Here's a breakdown of the parameters and their usage in the method:
  132        :type input: str
  133        """
  134
  135        if input and not isinstance(input, str):
  136            try:
  137                self.input = input.name
  138            except:
  139                log.error(f"Input file '{input} in bad format")
  140                raise ValueError(f"Input file '{input} in bad format")
  141        else:
  142            self.input = input
  143
  144        # Input format
  145        if input:
  146            input_name, input_extension = os.path.splitext(self.input)
  147            self.input_name = input_name
  148            self.input_extension = input_extension
  149            self.input_format = self.input_extension.replace(".", "")
  150
  151    def set_config(self, config: dict) -> None:
  152        """
  153        The set_config function takes a config object and assigns it as the configuration object for the
  154        class.
  155
  156        :param config: The `config` parameter in the `set_config` function is a dictionary object that
  157        contains configuration settings for the class. When you call the `set_config` function with a
  158        dictionary object as the argument, it will set that dictionary as the configuration object for
  159        the class
  160        :type config: dict
  161        """
  162
  163        self.config = config
  164
  165    def set_param(self, param: dict) -> None:
  166        """
  167        This function sets a parameter object for the class based on the input dictionary.
  168
  169        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
  170        as the `param` attribute of the class instance
  171        :type param: dict
  172        """
  173
  174        self.param = param
  175
  176    def init_variables(self) -> None:
  177        """
  178        This function initializes the variables that will be used in the rest of the class
  179        """
  180
  181        self.prefix = "howard"
  182        self.table_variants = "variants"
  183        self.dataframe = None
  184
  185        self.comparison_map = {
  186            "gt": ">",
  187            "gte": ">=",
  188            "lt": "<",
  189            "lte": "<=",
  190            "equals": "=",
  191            "contains": "SIMILAR TO",
  192        }
  193
  194        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
  195
  196        self.code_type_map_to_sql = {
  197            "Integer": "INTEGER",
  198            "String": "VARCHAR",
  199            "Float": "FLOAT",
  200            "Flag": "VARCHAR",
  201        }
  202
  203        self.index_additionnal_fields = []
  204
  205    def get_indexing(self) -> bool:
  206        """
  207        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
  208        returns False.
  209        :return: The value of the indexing parameter.
  210        """
  211
  212        return self.get_param().get("indexing", False)
  213
  214    def get_connexion_config(self) -> dict:
  215        """
  216        The function `get_connexion_config` returns a dictionary containing the configuration for a
  217        connection, including the number of threads and memory limit.
  218        :return: a dictionary containing the configuration for the Connexion library.
  219        """
  220
  221        # config
  222        config = self.get_config()
  223
  224        # Connexion config
  225        connexion_config = {}
  226        threads = self.get_threads()
  227
  228        # Threads
  229        if threads:
  230            connexion_config["threads"] = threads
  231
  232        # Memory
  233        # if config.get("memory", None):
  234        #     connexion_config["memory_limit"] = config.get("memory")
  235        if self.get_memory():
  236            connexion_config["memory_limit"] = self.get_memory()
  237
  238        # Temporary directory
  239        if config.get("tmp", None):
  240            connexion_config["temp_directory"] = config.get("tmp")
  241
  242        # Access
  243        if config.get("access", None):
  244            access = config.get("access")
  245            if access in ["RO"]:
  246                access = "READ_ONLY"
  247            elif access in ["RW"]:
  248                access = "READ_WRITE"
  249            connexion_db = self.get_connexion_db()
  250            if connexion_db in ":memory:":
  251                access = "READ_WRITE"
  252            connexion_config["access_mode"] = access
  253
  254        return connexion_config
  255
  256    def get_duckdb_settings(self) -> dict:
  257        """
  258        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
  259        string.
  260        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
  261        """
  262
  263        # config
  264        config = self.get_config()
  265
  266        # duckdb settings
  267        duckdb_settings_dict = {}
  268        if config.get("duckdb_settings", None):
  269            duckdb_settings = config.get("duckdb_settings")
  270            duckdb_settings = full_path(duckdb_settings)
  271            # duckdb setting is a file
  272            if os.path.exists(duckdb_settings):
  273                with open(duckdb_settings) as json_file:
  274                    duckdb_settings_dict = yaml.safe_load(json_file)
  275            # duckdb settings is a string
  276            else:
  277                duckdb_settings_dict = json.loads(duckdb_settings)
  278
  279        return duckdb_settings_dict
  280
  281    def set_connexion_db(self) -> str:
  282        """
  283        The function `set_connexion_db` returns the appropriate database connection string based on the
  284        input format and connection type.
  285        :return: the value of the variable `connexion_db`.
  286        """
  287
  288        # Default connexion db
  289        default_connexion_db = ":memory:"
  290
  291        # Find connexion db
  292        if self.get_input_format() in ["db", "duckdb"]:
  293            connexion_db = self.get_input()
  294        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
  295            connexion_db = default_connexion_db
  296        elif self.get_connexion_type() in ["tmpfile"]:
  297            tmp_name = tempfile.mkdtemp(
  298                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
  299            )
  300            connexion_db = f"{tmp_name}/tmp.db"
  301        elif self.get_connexion_type() != "":
  302            connexion_db = self.get_connexion_type()
  303        else:
  304            connexion_db = default_connexion_db
  305
  306        # Set connexion db
  307        self.connexion_db = connexion_db
  308
  309        return connexion_db
  310
  311    def set_connexion(self, conn) -> None:
  312        """
  313        The function `set_connexion` creates a connection to a database, with options for different
  314        database formats and settings.
  315
  316        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
  317        database. If a connection is not provided, a new connection to an in-memory database is created.
  318        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
  319        sqlite
  320        """
  321
  322        # Connexion db
  323        connexion_db = self.set_connexion_db()
  324
  325        # Connexion config
  326        connexion_config = self.get_connexion_config()
  327
  328        # Connexion format
  329        connexion_format = self.get_config().get("connexion_format", "duckdb")
  330        # Set connexion format
  331        self.connexion_format = connexion_format
  332
  333        # Connexion
  334        if not conn:
  335            if connexion_format in ["duckdb"]:
  336                conn = duckdb.connect(connexion_db, config=connexion_config)
  337                # duckDB settings
  338                duckdb_settings = self.get_duckdb_settings()
  339                if duckdb_settings:
  340                    for setting in duckdb_settings:
  341                        setting_value = duckdb_settings.get(setting)
  342                        if isinstance(setting_value, str):
  343                            setting_value = f"'{setting_value}'"
  344                        conn.execute(f"PRAGMA {setting}={setting_value};")
  345            elif connexion_format in ["sqlite"]:
  346                conn = sqlite3.connect(connexion_db)
  347
  348        # Set connexion
  349        self.conn = conn
  350
  351        # Log
  352        log.debug(f"connexion_format: {connexion_format}")
  353        log.debug(f"connexion_db: {connexion_db}")
  354        log.debug(f"connexion config: {connexion_config}")
  355        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
  356
  357    def set_output(self, output: str = None) -> None:
  358        """
  359        The `set_output` function in Python sets the output file based on the input or a specified key
  360        in the config file, extracting the output name, extension, and format.
  361
  362        :param output: The `output` parameter in the `set_output` method is used to specify the name of
  363        the output file. If the config file has an 'output' key, the method sets the output to the value
  364        of that key. If no output is provided, it sets the output to `None`
  365        :type output: str
  366        """
  367
  368        if output and not isinstance(output, str):
  369            self.output = output.name
  370        else:
  371            self.output = output
  372
  373        # Output format
  374        if self.output:
  375            output_name, output_extension = os.path.splitext(self.output)
  376            self.output_name = output_name
  377            self.output_extension = output_extension
  378            self.output_format = self.output_extension.replace(".", "")
  379        else:
  380            self.output_name = None
  381            self.output_extension = None
  382            self.output_format = None
  383
    def set_header(self) -> None:
        """
        Read the header of the input file and store it both as a list of
        strings (``self.header_list``) and as a VCF object
        (``self.header_vcf``); both are None when there is no input file.

        Lookup order for the header:
          1. a ``header_file`` declared in the configuration;
          2. the input file itself, for vcf/hdr formats (bgzip-compressed
             or plain);
          3. a sibling ``<input>.hdr`` file;
          4. a header reconstructed from the file columns (Database);
          5. a minimal default VCF 4.2 header as last resort.

        :raises ValueError: when the input file format is not supported
        """

        input_file = self.get_input()
        # Minimal VCF header used when no header can be found
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
        ]

        # Resolve input to a full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # Header provided as an explicit file in the configuration
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # Header embedded within the input file itself (vcf/hdr)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # bgzip-compressed vcf file (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # Uncompressed vcf file (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # Header provided in the default external file <input>.hdr
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to reconstruct header info fields from the file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Open the input as a Database object
                            db_for_header = Database(database=input_file)

                            # Header inferred from the columns (INFO fields)
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Actual columns present in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write the inferred header to a temporary file
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Re-read it and replace the #CHROM line with
                            # the real file columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    except:

                        # Best effort: fall back to the default VCF header
                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # Unsupported input format

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            if not header_list:
                header_list = default_header_list

            # Header as a list of strings
            self.header_list = header_list

            # Header as a VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            # No input file: no header available
            self.header_list = None
            self.header_vcf = None
  485
  486    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
  487        """
  488        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
  489        DataFrame based on the connection format.
  490
  491        :param query: The `query` parameter in the `get_query_to_df` function is a string that
  492        represents the SQL query you want to execute. This query will be used to fetch data from a
  493        database and convert it into a pandas DataFrame
  494        :type query: str
  495        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
  496        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
  497        function will only fetch up to that number of rows from the database query result. If no limit
  498        is specified,
  499        :type limit: int
  500        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
  501        """
  502
  503        # Connexion format
  504        connexion_format = self.get_connexion_format()
  505
  506        # Limit in query
  507        if limit:
  508            pd.set_option("display.max_rows", limit)
  509            if connexion_format in ["duckdb"]:
  510                df = (
  511                    self.conn.execute(query)
  512                    .fetch_record_batch(limit)
  513                    .read_next_batch()
  514                    .to_pandas()
  515                )
  516            elif connexion_format in ["sqlite"]:
  517                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
  518
  519        # Full query
  520        else:
  521            if connexion_format in ["duckdb"]:
  522                df = self.conn.execute(query).df()
  523            elif connexion_format in ["sqlite"]:
  524                df = pd.read_sql_query(query, self.conn)
  525
  526        return df
  527
  528    def get_overview(self) -> None:
  529        """
  530        The function prints the input, output, config, and dataframe of the current object
  531        """
  532        table_variants_from = self.get_table_variants(clause="from")
  533        sql_columns = self.get_header_columns_as_sql()
  534        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
  535        df = self.get_query_to_df(sql_query_export)
  536        log.info(
  537            "Input:  "
  538            + str(self.get_input())
  539            + " ["
  540            + str(str(self.get_input_format()))
  541            + "]"
  542        )
  543        log.info(
  544            "Output: "
  545            + str(self.get_output())
  546            + " ["
  547            + str(str(self.get_output_format()))
  548            + "]"
  549        )
  550        log.info("Config: ")
  551        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
  552            "\n"
  553        ):
  554            log.info("\t" + str(d))
  555        log.info("Param: ")
  556        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
  557            "\n"
  558        ):
  559            log.info("\t" + str(d))
  560        log.info("Sample list: " + str(self.get_header_sample_list()))
  561        log.info("Dataframe: ")
  562        for d in str(df).split("\n"):
  563            log.info("\t" + str(d))
  564
  565        # garbage collector
  566        del df
  567        gc.collect()
  568
  569        return None
  570
    def get_stats(self) -> dict:
        """
        Compute and return statistics of the current object: input file,
        variant counts (by chromosome, by type SNV/InDel/MNV, and by
        substitution), samples and genotype counts, INFO/FORMAT header
        fields, and quality metrics.

        :return: a dictionary with sections "Infos", "Variants", "Header",
            and optionally "Samples" (when genotypes are available) and
            "Quality" (when a QUAL column is present)
        """

        # Log
        log.info(f"Stats Calculation...")

        # table variants
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header INFO and FORMAT fields
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chromosome
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Fraction of variants per chromosome
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Check Samples: genotypes are only analysed for VCF-like content
        # (GT declared in the FORMAT header and a FORMAT column present)
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                # Count each genotype pattern for the sample; rows are only
                # kept when the sample value has as many ':'-separated
                # fields as the FORMAT column
                sql_query_samples = f"""
                    SELECT  '{sample}' as sample,
                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                      )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

            stats["Samples"] = samples
            stats["Infos"]["Number of samples"] = nb_of_samples

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # Number: translate special VCF counts (./A/G/R)
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # Type ("." when undefined)
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # Description ("" when undefined)
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL: distribution metrics over the numeric QUAL values
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                    SELECT
                        avg(CAST(QUAL AS INTEGER)) AS Average,
                        min(CAST(QUAL AS INTEGER)) AS Minimum,
                        max(CAST(QUAL AS INTEGER)) AS Maximum,
                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                        median(CAST(QUAL AS INTEGER)) AS Median,
                        variance(CAST(QUAL AS INTEGER)) AS Variance
                    FROM {table_variants_from}
                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                    """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel: classify variants by REF/ALT lengths

        sql_query_snv = f"""
            
            SELECT Type, count FROM (

                    SELECT
                        'Total' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}

                    UNION

                    SELECT
                        'MNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 AND len(ALT) > 1
                    AND len(REF) = len(ALT)

                    UNION

                    SELECT
                        'InDel' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 OR len(ALT) > 1
                    AND len(REF) != len(ALT)
                    
                    UNION

                    SELECT
                        'SNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) = 1 AND len(ALT) = 1

                )

            ORDER BY count DESC

                """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        # SNV substitutions (e.g. A>G), most frequent first
        sql_query_snv_substitution = f"""
                SELECT
                    concat(REF, '>', ALT) AS 'Substitution',
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1
                GROUP BY REF, ALT
                ORDER BY count(*) DESC
                """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats
  792
  793    def stats_to_file(self, file: str = None) -> str:
  794        """
  795        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
  796        into a JSON object, and writes the JSON object to the specified file.
  797
  798        :param file: The `file` parameter is a string that represents the file path where the JSON data
  799        will be written
  800        :type file: str
  801        :return: the name of the file that was written to.
  802        """
  803
  804        # Get stats
  805        stats = self.get_stats()
  806
  807        # Serializing json
  808        json_object = json.dumps(stats, indent=4)
  809
  810        # Writing to sample.json
  811        with open(file, "w") as outfile:
  812            outfile.write(json_object)
  813
  814        return file
  815
    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
        """
        Generate a markdown report of the statistics and print it.

        The stats are first written as JSON (``json_file``), then rendered
        as a markdown document (``output_file``) with a title, an index of
        sections, and one section per stats entry; the markdown is also
        printed to stdout.

        :param output_file: path of the markdown output file; a temporary
            "stats.md" is used when None
        :type output_file: str
        :param json_file: path of the JSON stats file; a temporary
            "stats.json" is used when None
        :type json_file: str
        :return: None
        """

        # Resolve to full paths
        output_file = full_path(output_file)
        json_file = full_path(json_file)

        with tempfile.TemporaryDirectory() as tmpdir:

            # Default file locations within the temporary folder
            if not output_file:
                output_file = os.path.join(tmpdir, "stats.md")
            if not json_file:
                json_file = os.path.join(tmpdir, "stats.json")

            # Create output folders when missing
            if not os.path.exists(os.path.dirname(output_file)):
                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
            if not os.path.exists(os.path.dirname(json_file)):
                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

            # Create stats JSON file
            stats_file = self.stats_to_file(file=json_file)

            # Load the stats back (YAML parser also reads JSON)
            with open(stats_file) as f:
                stats = yaml.safe_load(f)

            # Output buffers: title, index and body of the markdown report
            output_title = []
            output_index = []
            output = []

            # Title
            output_title.append("# HOWARD Stats")

            # Index
            output_index.append("## Index")

            # Process sections
            for section in stats:
                infos = stats.get(section)
                section_link = "#" + section.lower().replace(" ", "-")
                output.append(f"## {section}")
                output_index.append(f"- [{section}]({section_link})")

                if len(infos):
                    for info in infos:
                        # Try to render the entry as a markdown table,
                        # either from a dict or from a JSON string;
                        # fall back to plain "key: value" text
                        try:
                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                            is_df = True
                        except:
                            try:
                                df = pd.DataFrame.from_dict(
                                    json.loads((infos.get(info))), orient="index"
                                )
                                is_df = True
                            except:
                                is_df = False
                        if is_df:
                            output.append(f"### {info}")
                            info_link = "#" + info.lower().replace(" ", "-")
                            output_index.append(f"   - [{info}]({info_link})")
                            output.append(f"{df.to_markdown(index=False)}")
                        else:
                            output.append(f"- {info}: {infos.get(info)}")
                else:
                    output.append(f"NA")

            # Write title, index and body to the markdown file
            with open(output_file, "w") as fp:
                for item in output_title:
                    fp.write("%s\n" % item)
                for item in output_index:
                    fp.write("%s\n" % item)
                for item in output:
                    fp.write("%s\n" % item)

            # Print the markdown report to stdout
            print("")
            print("\n\n".join(output_title))
            print("")
            print("\n\n".join(output))
            print("")

        return None
  917
  918    def get_input(self) -> str:
  919        """
  920        It returns the value of the input variable.
  921        :return: The input is being returned.
  922        """
  923        return self.input
  924
  925    def get_input_format(self, input_file: str = None) -> str:
  926        """
  927        This function returns the format of the input variable, either from the provided input file or
  928        by prompting for input.
  929
  930        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
  931        represents the file path of the input file. If no `input_file` is provided when calling the
  932        method, it will default to `None`
  933        :type input_file: str
  934        :return: The format of the input variable is being returned.
  935        """
  936
  937        if not input_file:
  938            input_file = self.get_input()
  939        input_format = get_file_format(input_file)
  940        return input_format
  941
  942    def get_input_compressed(self, input_file: str = None) -> str:
  943        """
  944        The function `get_input_compressed` returns the format of the input variable after compressing
  945        it.
  946
  947        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
  948        that represents the file path of the input file. If no `input_file` is provided when calling the
  949        method, it will default to `None` and the method will then call `self.get_input()` to
  950        :type input_file: str
  951        :return: The function `get_input_compressed` returns the compressed format of the input
  952        variable.
  953        """
  954
  955        if not input_file:
  956            input_file = self.get_input()
  957        input_compressed = get_file_compressed(input_file)
  958        return input_compressed
  959
  960    def get_output(self) -> str:
  961        """
  962        It returns the output of the neuron.
  963        :return: The output of the neural network.
  964        """
  965
  966        return self.output
  967
  968    def get_output_format(self, output_file: str = None) -> str:
  969        """
  970        The function `get_output_format` returns the format of the input variable or the output file if
  971        provided.
  972
  973        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
  974        that represents the file path of the output file. If no `output_file` is provided when calling
  975        the method, it will default to the output obtained from the `get_output` method of the class
  976        instance. The
  977        :type output_file: str
  978        :return: The format of the input variable is being returned.
  979        """
  980
  981        if not output_file:
  982            output_file = self.get_output()
  983        output_format = get_file_format(output_file)
  984
  985        return output_format
  986
  987    def get_config(self) -> dict:
  988        """
  989        It returns the config
  990        :return: The config variable is being returned.
  991        """
  992        return self.config
  993
  994    def get_param(self) -> dict:
  995        """
  996        It returns the param
  997        :return: The param variable is being returned.
  998        """
  999        return self.param
 1000
 1001    def get_connexion_db(self) -> str:
 1002        """
 1003        It returns the connexion_db attribute of the object
 1004        :return: The connexion_db is being returned.
 1005        """
 1006        return self.connexion_db
 1007
 1008    def get_prefix(self) -> str:
 1009        """
 1010        It returns the prefix of the object.
 1011        :return: The prefix is being returned.
 1012        """
 1013        return self.prefix
 1014
 1015    def get_table_variants(self, clause: str = "select") -> str:
 1016        """
 1017        This function returns the table_variants attribute of the object
 1018
 1019        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
 1020        defaults to select (optional)
 1021        :return: The table_variants attribute of the object.
 1022        """
 1023
 1024        # Access
 1025        access = self.get_config().get("access", None)
 1026
 1027        # Clauses "select", "where", "update"
 1028        if clause in ["select", "where", "update"]:
 1029            table_variants = self.table_variants
 1030        # Clause "from"
 1031        elif clause in ["from"]:
 1032            # For Read Only
 1033            if self.get_input_format() in ["parquet"] and access in ["RO"]:
 1034                input_file = self.get_input()
 1035                table_variants = f"'{input_file}' as variants"
 1036            # For Read Write
 1037            else:
 1038                table_variants = f"{self.table_variants} as variants"
 1039        else:
 1040            table_variants = self.table_variants
 1041        return table_variants
 1042
 1043    def get_tmp_dir(self) -> str:
 1044        """
 1045        The function `get_tmp_dir` returns the temporary directory path based on configuration
 1046        parameters or a default path.
 1047        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
 1048        configuration, parameters, and a default value of "/tmp".
 1049        """
 1050
 1051        return get_tmp(
 1052            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
 1053        )
 1054
 1055    def get_connexion_type(self) -> str:
 1056        """
 1057        If the connexion type is not in the list of allowed connexion types, raise a ValueError
 1058
 1059        :return: The connexion type is being returned.
 1060        """
 1061        return self.get_config().get("connexion_type", "memory")
 1062
 1063    def get_connexion(self):
 1064        """
 1065        It returns the connection object
 1066
 1067        :return: The connection object.
 1068        """
 1069        return self.conn
 1070
 1071    def close_connexion(self) -> None:
 1072        """
 1073        This function closes the connection to the database.
 1074        :return: The connection is being closed.
 1075        """
 1076        return self.conn.close()
 1077
 1078    def get_header(self, type: str = "vcf"):
 1079        """
 1080        This function returns the header of the VCF file as a list of strings
 1081
 1082        :param type: the type of header you want to get, defaults to vcf (optional)
 1083        :return: The header of the vcf file.
 1084        """
 1085
 1086        if self.header_vcf:
 1087            if type == "vcf":
 1088                return self.header_vcf
 1089            elif type == "list":
 1090                return self.header_list
 1091        else:
 1092            if type == "vcf":
 1093                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
 1094                return header
 1095            elif type == "list":
 1096                return vcf_required
 1097
 1098    def get_header_infos_list(self) -> list:
 1099        """
 1100        This function retrieves a list of information fields from the header.
 1101        :return: A list of information fields from the header.
 1102        """
 1103
 1104        # Init
 1105        infos_list = []
 1106
 1107        for field in self.get_header().infos:
 1108            infos_list.append(field)
 1109
 1110        return infos_list
 1111
 1112    def get_header_length(self, file: str = None) -> int:
 1113        """
 1114        The function `get_header_length` returns the length of the header list, excluding the #CHROM
 1115        line.
 1116
 1117        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
 1118        header file. If this argument is provided, the function will read the header from the specified
 1119        file and return the length of the header list minus 1 (to exclude the #CHROM line)
 1120        :type file: str
 1121        :return: the length of the header list, excluding the #CHROM line.
 1122        """
 1123
 1124        if file:
 1125            return len(self.read_vcf_header_file(file=file)) - 1
 1126        elif self.get_header(type="list"):
 1127            return len(self.get_header(type="list")) - 1
 1128        else:
 1129            return 0
 1130
 1131    def get_header_columns(self) -> str:
 1132        """
 1133        This function returns the header list of a VCF
 1134
 1135        :return: The length of the header list.
 1136        """
 1137        if self.get_header():
 1138            return self.get_header(type="list")[-1]
 1139        else:
 1140            return ""
 1141
 1142    def get_header_columns_as_list(self) -> list:
 1143        """
 1144        This function returns the header list of a VCF
 1145
 1146        :return: The length of the header list.
 1147        """
 1148        if self.get_header():
 1149            return self.get_header_columns().strip().split("\t")
 1150        else:
 1151            return []
 1152
 1153    def get_header_columns_as_sql(self) -> str:
 1154        """
 1155        This function retruns header length (without #CHROM line)
 1156
 1157        :return: The length of the header list.
 1158        """
 1159        sql_column_list = []
 1160        for col in self.get_header_columns_as_list():
 1161            sql_column_list.append(f'"{col}"')
 1162        return ",".join(sql_column_list)
 1163
 1164    def get_header_sample_list(
 1165        self, check: bool = False, samples: list = None, samples_force: bool = False
 1166    ) -> list:
 1167        """
 1168        The function `get_header_sample_list` returns a list of samples from a VCF header, with optional
 1169        checking and filtering based on input parameters.
 1170
 1171        :param check: The `check` parameter in the `get_header_sample_list` function is a boolean
 1172        parameter that determines whether to check if the samples in the list are properly defined as
 1173        genotype columns. If `check` is set to `True`, the function will verify if each sample in the
 1174        list is defined as a, defaults to False
 1175        :type check: bool (optional)
 1176        :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that
 1177        allows you to specify a subset of samples from the header. If you provide a list of sample
 1178        names, the function will check if each sample is defined in the header. If a sample is not found
 1179        in the
 1180        :type samples: list
 1181        :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is
 1182        a boolean parameter that determines whether to force the function to return the sample list
 1183        without checking if the samples are genotype columns. If `samples_force` is set to `True`, the
 1184        function will return the sample list without performing, defaults to False
 1185        :type samples_force: bool (optional)
 1186        :return: The function `get_header_sample_list` returns a list of samples based on the input
 1187        parameters and conditions specified in the function.
 1188        """
 1189
 1190        # Init
 1191        samples_list = []
 1192
 1193        if samples is None:
 1194            samples_list = self.header_vcf.samples
 1195        else:
 1196            samples_checked = []
 1197            for sample in samples:
 1198                if sample in self.header_vcf.samples:
 1199                    samples_checked.append(sample)
 1200                else:
 1201                    log.warning(f"Sample '{sample}' not defined in header")
 1202            samples_list = samples_checked
 1203
 1204            # Force sample list without checking if is_genotype_column
 1205            if samples_force:
 1206                log.warning(f"Samples {samples_list} not checked if genotypes")
 1207                return samples_list
 1208
 1209        if check:
 1210            samples_checked = []
 1211            for sample in samples_list:
 1212                if self.is_genotype_column(column=sample):
 1213                    samples_checked.append(sample)
 1214                else:
 1215                    log.warning(
 1216                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
 1217                    )
 1218            samples_list = samples_checked
 1219
 1220        # Return samples list
 1221        return samples_list
 1222
 1223    def is_genotype_column(self, column: str = None) -> bool:
 1224        """
 1225        This function checks if a given column is a genotype column in a database.
 1226
 1227        :param column: The `column` parameter in the `is_genotype_column` method is a string that
 1228        represents the column name in a database table. This method checks if the specified column is a
 1229        genotype column in the database. If a column name is provided, it calls the `is_genotype_column`
 1230        method of
 1231        :type column: str
 1232        :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter
 1233        is not None, it calls the `is_genotype_column` method of the `Database` class with the specified
 1234        column name and returns the result. If the `column` parameter is None, it returns False.
 1235        """
 1236
 1237        if column is not None:
 1238            return Database(database=self.get_input()).is_genotype_column(column=column)
 1239        else:
 1240            return False
 1241
 1242    def get_verbose(self) -> bool:
 1243        """
 1244        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
 1245        exist
 1246
 1247        :return: The value of the key "verbose" in the config dictionary.
 1248        """
 1249        return self.get_config().get("verbose", False)
 1250
 1251    def get_connexion_format(self) -> str:
 1252        """
 1253        It returns the connexion format of the object.
 1254        :return: The connexion_format is being returned.
 1255        """
 1256        connexion_format = self.connexion_format
 1257        if connexion_format not in ["duckdb", "sqlite"]:
 1258            log.error(f"Unknown connexion format {connexion_format}")
 1259            raise ValueError(f"Unknown connexion format {connexion_format}")
 1260        else:
 1261            return connexion_format
 1262
 1263    def insert_file_to_table(
 1264        self,
 1265        file,
 1266        columns: str,
 1267        header_len: int = 0,
 1268        sep: str = "\t",
 1269        chunksize: int = 1000000,
 1270    ) -> None:
 1271        """
 1272        The function reads a file in chunks and inserts each chunk into a table based on the specified
 1273        database format.
 1274
 1275        :param file: The `file` parameter is the file that you want to load into a table. It should be
 1276        the path to the file on your system
 1277        :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that
 1278        should contain the names of the columns in the table where the data will be inserted. The column
 1279        names should be separated by commas within the string. For example, if you have columns named
 1280        "id", "name
 1281        :type columns: str
 1282        :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies
 1283        the number of lines to skip at the beginning of the file before reading the actual data. This
 1284        parameter allows you to skip any header information present in the file before processing the
 1285        data, defaults to 0
 1286        :type header_len: int (optional)
 1287        :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the
 1288        separator character that is used in the file being read. In this case, the default separator is
 1289        set to `\t`, which represents a tab character. You can change this parameter to a different
 1290        separator character if, defaults to \t
 1291        :type sep: str (optional)
 1292        :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time
 1293        when processing the file in chunks. In the provided code snippet, the default value for
 1294        `chunksize` is set to 1000000. This means that the file will be read in chunks of 1,, defaults
 1295        to 1000000
 1296        :type chunksize: int (optional)
 1297        """
 1298
 1299        # Config
 1300        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
 1301        connexion_format = self.get_connexion_format()
 1302
 1303        log.debug("chunksize: " + str(chunksize))
 1304
 1305        if chunksize:
 1306            for chunk in pd.read_csv(
 1307                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
 1308            ):
 1309                if connexion_format in ["duckdb"]:
 1310                    sql_insert_into = (
 1311                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
 1312                    )
 1313                    self.conn.execute(sql_insert_into)
 1314                elif connexion_format in ["sqlite"]:
 1315                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
 1316
 1317    def load_data(
 1318        self,
 1319        input_file: str = None,
 1320        drop_variants_table: bool = False,
 1321        sample_size: int = 20480,
 1322    ) -> None:
 1323        """
 1324        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
 1325        table before loading the data and specify a sample size.
 1326
 1327        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
 1328        table
 1329        :type input_file: str
 1330        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
 1331        determines whether the variants table should be dropped before loading the data. If set to
 1332        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
 1333        not be dropped, defaults to False
 1334        :type drop_variants_table: bool (optional)
 1335        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
 1336        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
 1337        20480
 1338        :type sample_size: int (optional)
 1339        """
 1340
 1341        log.info("Loading...")
 1342
 1343        # change input file
 1344        if input_file:
 1345            self.set_input(input_file)
 1346            self.set_header()
 1347
 1348        # drop variants table
 1349        if drop_variants_table:
 1350            self.drop_variants_table()
 1351
 1352        # get table variants
 1353        table_variants = self.get_table_variants()
 1354
 1355        # Access
 1356        access = self.get_config().get("access", None)
 1357        log.debug(f"access: {access}")
 1358
 1359        # Input format and compress
 1360        input_format = self.get_input_format()
 1361        input_compressed = self.get_input_compressed()
 1362        log.debug(f"input_format: {input_format}")
 1363        log.debug(f"input_compressed: {input_compressed}")
 1364
 1365        # input_compressed_format
 1366        if input_compressed:
 1367            input_compressed_format = "gzip"
 1368        else:
 1369            input_compressed_format = "none"
 1370        log.debug(f"input_compressed_format: {input_compressed_format}")
 1371
 1372        # Connexion format
 1373        connexion_format = self.get_connexion_format()
 1374
 1375        # Sample size
 1376        if not sample_size:
 1377            sample_size = -1
 1378        log.debug(f"sample_size: {sample_size}")
 1379
 1380        # Load data
 1381        log.debug(f"Load Data from {input_format}")
 1382
 1383        # DuckDB connexion
 1384        if connexion_format in ["duckdb"]:
 1385
 1386            # Database already exists
 1387            if self.input_format in ["db", "duckdb"]:
 1388
 1389                if connexion_format in ["duckdb"]:
 1390                    log.debug(f"Input file format '{self.input_format}' duckDB")
 1391                else:
 1392                    log.error(
 1393                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1394                    )
 1395                    raise ValueError(
 1396                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1397                    )
 1398
 1399            # Load from existing database format
 1400            else:
 1401
 1402                try:
 1403                    # Create Table or View
 1404                    database = Database(database=self.input)
 1405                    sql_from = database.get_sql_from(sample_size=sample_size)
 1406
 1407                    if access in ["RO"]:
 1408                        sql_load = (
 1409                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
 1410                        )
 1411                    else:
 1412                        sql_load = (
 1413                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
 1414                        )
 1415                    self.conn.execute(sql_load)
 1416
 1417                except:
 1418                    # Format not available
 1419                    log.error(f"Input file format '{self.input_format}' not available")
 1420                    raise ValueError(
 1421                        f"Input file format '{self.input_format}' not available"
 1422                    )
 1423
 1424        # SQLite connexion
 1425        elif connexion_format in ["sqlite"] and input_format in [
 1426            "vcf",
 1427            "tsv",
 1428            "csv",
 1429            "psv",
 1430        ]:
 1431
 1432            # Main structure
 1433            structure = {
 1434                "#CHROM": "VARCHAR",
 1435                "POS": "INTEGER",
 1436                "ID": "VARCHAR",
 1437                "REF": "VARCHAR",
 1438                "ALT": "VARCHAR",
 1439                "QUAL": "VARCHAR",
 1440                "FILTER": "VARCHAR",
 1441                "INFO": "VARCHAR",
 1442            }
 1443
 1444            # Strcuture with samples
 1445            structure_complete = structure
 1446            if self.get_header_sample_list():
 1447                structure["FORMAT"] = "VARCHAR"
 1448                for sample in self.get_header_sample_list():
 1449                    structure_complete[sample] = "VARCHAR"
 1450
 1451            # Columns list for create and insert
 1452            sql_create_table_columns = []
 1453            sql_create_table_columns_list = []
 1454            for column in structure_complete:
 1455                column_type = structure_complete[column]
 1456                sql_create_table_columns.append(
 1457                    f'"{column}" {column_type} default NULL'
 1458                )
 1459                sql_create_table_columns_list.append(f'"{column}"')
 1460
 1461            # Create database
 1462            log.debug(f"Create Table {table_variants}")
 1463            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
 1464            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
 1465            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
 1466            self.conn.execute(sql_create_table)
 1467
 1468            # chunksize define length of file chunk load file
 1469            chunksize = 100000
 1470
 1471            # delimiter
 1472            delimiter = file_format_delimiters.get(input_format, "\t")
 1473
 1474            # Load the input file
 1475            with open(self.input, "rt") as input_file:
 1476
 1477                # Use the appropriate file handler based on the input format
 1478                if input_compressed:
 1479                    input_file = bgzf.open(self.input, "rt")
 1480                if input_format in ["vcf"]:
 1481                    header_len = self.get_header_length()
 1482                else:
 1483                    header_len = 0
 1484
 1485                # Insert the file contents into a table
 1486                self.insert_file_to_table(
 1487                    input_file,
 1488                    columns=sql_create_table_columns_list_sql,
 1489                    header_len=header_len,
 1490                    sep=delimiter,
 1491                    chunksize=chunksize,
 1492                )
 1493
 1494        else:
 1495            log.error(
 1496                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1497            )
 1498            raise ValueError(
 1499                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1500            )
 1501
 1502        # Explode INFOS fields into table fields
 1503        if self.get_explode_infos():
 1504            self.explode_infos(
 1505                prefix=self.get_explode_infos_prefix(),
 1506                fields=self.get_explode_infos_fields(),
 1507                force=True,
 1508            )
 1509
 1510        # Create index after insertion
 1511        self.create_indexes()
 1512
 1513    def get_explode_infos(self) -> bool:
 1514        """
 1515        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
 1516        to False if it is not set.
 1517        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
 1518        value. If the parameter is not present, it will return False.
 1519        """
 1520
 1521        return self.get_param().get("explode", {}).get("explode_infos", False)
 1522
 1523    def get_explode_infos_fields(
 1524        self,
 1525        explode_infos_fields: str = None,
 1526        remove_fields_not_in_header: bool = False,
 1527    ) -> list:
 1528        """
 1529        The `get_explode_infos_fields` function returns a list of exploded information fields based on
 1530        the input parameter `explode_infos_fields`.
 1531
 1532        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
 1533        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
 1534        comma-separated list of field names to explode
 1535        :type explode_infos_fields: str
 1536        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
 1537        flag that determines whether to remove fields that are not present in the header. If it is set
 1538        to `True`, any field that is not in the header will be excluded from the list of exploded
 1539        information fields. If it is set to `, defaults to False
 1540        :type remove_fields_not_in_header: bool (optional)
 1541        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
 1542        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
 1543        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
 1544        Otherwise, it returns a list of exploded information fields after removing any spaces and
 1545        splitting the string by commas.
 1546        """
 1547
 1548        # If no fields, get it in param
 1549        if not explode_infos_fields:
 1550            explode_infos_fields = (
 1551                self.get_param().get("explode", {}).get("explode_infos_fields", None)
 1552            )
 1553
 1554        # If no fields, defined as all fields in header using keyword
 1555        if not explode_infos_fields:
 1556            explode_infos_fields = "*"
 1557
 1558        # If fields list not empty
 1559        if explode_infos_fields:
 1560
 1561            # Input fields list
 1562            if isinstance(explode_infos_fields, str):
 1563                fields_input = explode_infos_fields.split(",")
 1564            elif isinstance(explode_infos_fields, list):
 1565                fields_input = explode_infos_fields
 1566            else:
 1567                fields_input = []
 1568
 1569            # Fields list without * keyword
 1570            fields_without_all = fields_input.copy()
 1571            if "*".casefold() in (item.casefold() for item in fields_without_all):
 1572                fields_without_all.remove("*")
 1573
 1574            # Fields in header
 1575            fields_in_header = sorted(list(set(self.get_header().infos)))
 1576
 1577            # Construct list of fields
 1578            fields_output = []
 1579            for field in fields_input:
 1580
 1581                # Strip field
 1582                field = field.strip()
 1583
 1584                # format keyword * in regex
 1585                if field.upper() in ["*"]:
 1586                    field = ".*"
 1587
 1588                # Find all fields with pattern
 1589                r = re.compile(rf"^{field}$")
 1590                fields_search = sorted(list(filter(r.match, fields_in_header)))
 1591
 1592                # Remove fields input from search
 1593                if field in fields_search:
 1594                    fields_search = [field]
 1595                elif fields_search != [field]:
 1596                    fields_search = sorted(
 1597                        list(set(fields_search).difference(fields_input))
 1598                    )
 1599
 1600                # If field is not in header (avoid not well formatted header)
 1601                if not fields_search and not remove_fields_not_in_header:
 1602                    fields_search = [field]
 1603
 1604                # Add found fields
 1605                for new_field in fields_search:
 1606                    # Add field, if not already exists, and if it is in header (if asked)
 1607                    if (
 1608                        new_field not in fields_output
 1609                        and (
 1610                            not remove_fields_not_in_header
 1611                            or new_field in fields_in_header
 1612                        )
 1613                        and new_field not in [".*"]
 1614                    ):
 1615                        fields_output.append(new_field)
 1616
 1617            return fields_output
 1618
 1619        else:
 1620
 1621            return []
 1622
 1623    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
 1624        """
 1625        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
 1626        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
 1627        not provided.
 1628
 1629        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
 1630        prefix to be used for exploding or expanding information
 1631        :type explode_infos_prefix: str
 1632        :return: the value of the variable `explode_infos_prefix`.
 1633        """
 1634
 1635        if not explode_infos_prefix:
 1636            explode_infos_prefix = (
 1637                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
 1638            )
 1639
 1640        return explode_infos_prefix
 1641
 1642    def add_column(
 1643        self,
 1644        table_name,
 1645        column_name,
 1646        column_type,
 1647        default_value=None,
 1648        drop: bool = False,
 1649    ) -> dict:
 1650        """
 1651        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
 1652        doesn't already exist.
 1653
 1654        :param table_name: The name of the table to which you want to add a column
 1655        :param column_name: The parameter "column_name" is the name of the column that you want to add
 1656        to the table
 1657        :param column_type: The `column_type` parameter specifies the data type of the column that you
 1658        want to add to the table. It should be a string that represents the desired data type, such as
 1659        "INTEGER", "TEXT", "REAL", etc
 1660        :param default_value: The `default_value` parameter is an optional parameter that specifies the
 1661        default value for the newly added column. If a default value is provided, it will be assigned to
 1662        the column for any existing rows that do not have a value for that column
 1663        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
 1664        if it already exists in the table. If `drop` is set to `True`, the function will drop the
 1665        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
 1666        to False
 1667        :type drop: bool (optional)
 1668        :return: a boolean value indicating whether the column was successfully added to the table.
 1669        """
 1670
 1671        # added
 1672        added = False
 1673        dropped = False
 1674
 1675        # Check if the column already exists in the table
 1676        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1677        columns = self.get_query_to_df(query).columns.tolist()
 1678        if column_name.upper() in [c.upper() for c in columns]:
 1679            log.debug(
 1680                f"The {column_name} column already exists in the {table_name} table"
 1681            )
 1682            if drop:
 1683                self.drop_column(table_name=table_name, column_name=column_name)
 1684                dropped = True
 1685            else:
 1686                return None
 1687        else:
 1688            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1689
 1690        # Add column in table
 1691        add_column_query = (
 1692            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
 1693        )
 1694        if default_value is not None:
 1695            add_column_query += f" DEFAULT {default_value}"
 1696        self.execute_query(add_column_query)
 1697        added = not dropped
 1698        log.debug(
 1699            f"The {column_name} column was successfully added to the {table_name} table"
 1700        )
 1701
 1702        if added:
 1703            added_column = {
 1704                "table_name": table_name,
 1705                "column_name": column_name,
 1706                "column_type": column_type,
 1707                "default_value": default_value,
 1708            }
 1709        else:
 1710            added_column = None
 1711
 1712        return added_column
 1713
 1714    def drop_column(
 1715        self, column: dict = None, table_name: str = None, column_name: str = None
 1716    ) -> bool:
 1717        """
 1718        The `drop_column` function drops a specified column from a given table in a database and returns
 1719        True if the column was successfully dropped, and False if the column does not exist in the
 1720        table.
 1721
 1722        :param column: The `column` parameter is a dictionary that contains information about the column
 1723        you want to drop. It has two keys:
 1724        :type column: dict
 1725        :param table_name: The `table_name` parameter is the name of the table from which you want to
 1726        drop a column
 1727        :type table_name: str
 1728        :param column_name: The `column_name` parameter is the name of the column that you want to drop
 1729        from the table
 1730        :type column_name: str
 1731        :return: a boolean value. It returns True if the column was successfully dropped from the table,
 1732        and False if the column does not exist in the table.
 1733        """
 1734
 1735        # Find column infos
 1736        if column:
 1737            if isinstance(column, dict):
 1738                table_name = column.get("table_name", None)
 1739                column_name = column.get("column_name", None)
 1740            elif isinstance(column, str):
 1741                table_name = self.get_table_variants()
 1742                column_name = column
 1743            else:
 1744                table_name = None
 1745                column_name = None
 1746
 1747        if not table_name and not column_name:
 1748            return False
 1749
 1750        # Removed
 1751        removed = False
 1752
 1753        # Check if the column already exists in the table
 1754        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1755        columns = self.get_query_to_df(query).columns.tolist()
 1756        if column_name in columns:
 1757            log.debug(f"The {column_name} column exists in the {table_name} table")
 1758        else:
 1759            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1760            return False
 1761
 1762        # Add column in table # ALTER TABLE integers DROP k
 1763        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
 1764        self.execute_query(add_column_query)
 1765        removed = True
 1766        log.debug(
 1767            f"The {column_name} column was successfully dropped to the {table_name} table"
 1768        )
 1769
 1770        return removed
 1771
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        Explode VCF INFO fields into individual table columns.

        For each requested INFO field, a dedicated column named
        ``<prefix><field>`` is added to the target table and populated by
        parsing the raw ``INFO`` string in SQL (regex for DuckDB, string
        functions for SQLite). Updates are run chromosome by chromosome.
        Nothing happens when the database access mode is "RO".

        :param prefix: prefix for the exploded columns; when None/True or
        not a string, falls back to the configured prefix, then to "INFO/"
        :type prefix: str
        :param create_index: create indexes after exploding, defaults to
        False
        :type create_index: bool (optional)
        :param fields: INFO fields to explode (patterns allowed, resolved
        by `get_explode_infos_fields`); unknown fields are skipped
        :type fields: list
        :param force: drop and re-create columns that already exist,
        defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: run a single UPDATE covering
        all fields instead of one UPDATE per field, defaults to False.
        NOTE(review): the parameter name keeps its historical misspelling
        for API compatibility
        :type proccess_all_fields_together: bool (optional)
        :param table: target table; defaults to the variants table
        :type table: str
        :return: the list of added column descriptors, as returned by
        `add_column`
        """

        # Indexes must be dropped before the ALTER TABLE / mass UPDATE below
        self.drop_indexes()

        # Database flavour drives the SQL used to parse INFO ("duckdb" or "sqlite")
        connexion_format = self.get_connexion_format()

        # Access mode; "RO" (read-only) disables any modification
        access = self.get_config().get("access", None)

        # Descriptors of the columns actually added (return value)
        added_columns = []

        if access not in ["RO"]:

            # Resolve the column prefix: explicit > configured > "INFO/"
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # Target table (defaults to the variants table)
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # Columns already present in the table but absent from the header
            # NOTE(review): bare except — presumably tolerates a missing or
            # not-yet-loaded table; confirm the intended failure mode
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # INFO definitions declared in the VCF header
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            # One `"col" = <parse expression>` fragment per field to explode
            sql_info_alter_table_array = []

            # Candidate fields: header INFO fields plus explicitly requested ones
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Expand patterns (e.g. wildcards) into concrete field names
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                info_id_sql = prefix + info

                # Only explode known fields (header, prefixed header, or extra columns)
                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Type/arity from the header; unknown fields default to String
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Multi-valued fields (num != 1) are stored as VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add the column (dropped first when force=True)
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    if added_column or force:

                        # Register the new column for later indexing
                        self.index_additionnal_fields.append(info_id_sql)

                        # Build the per-field parse expression; empty or "." values map to NULL.
                        # NOTE(review): if connexion_format is neither duckdb nor sqlite,
                        # update_info_field keeps the previous iteration's value (or raises
                        # NameError on the first pass) — confirm only these two formats
                        # can reach this code
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            # SQLite has no regex: locate '{info}=' with instr() and
                            # slice the value with substr() up to the next ';'
                            update_info_field = f"""
                                "{info_id_sql}" =
                                    CASE
                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                    END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # Process chromosome by chromosome to bound each UPDATE.
                # NOTE(review): bare except — falls back to one global UPDATE
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Restrict the UPDATE to one chromosome (unless there is only one)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Either a single UPDATE for all fields, or one UPDATE per field
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter_table_array_join}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # Re-create indexes if requested
        if create_index:
            self.create_indexes()

        return added_columns
 1988
 1989    def create_indexes(self) -> None:
 1990        """
 1991        Create indexes on the table after insertion
 1992        """
 1993
 1994        # Access
 1995        access = self.get_config().get("access", None)
 1996
 1997        # get table variants
 1998        table_variants = self.get_table_variants("FROM")
 1999
 2000        if self.get_indexing() and access not in ["RO"]:
 2001            # Create index
 2002            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
 2003            self.conn.execute(sql_create_table_index)
 2004            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
 2005            self.conn.execute(sql_create_table_index)
 2006            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
 2007            self.conn.execute(sql_create_table_index)
 2008            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
 2009            self.conn.execute(sql_create_table_index)
 2010            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
 2011            self.conn.execute(sql_create_table_index)
 2012            for field in self.index_additionnal_fields:
 2013                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
 2014                self.conn.execute(sql_create_table_index)
 2015
 2016    def drop_indexes(self) -> None:
 2017        """
 2018        Create indexes on the table after insertion
 2019        """
 2020
 2021        # Access
 2022        access = self.get_config().get("access", None)
 2023
 2024        # get table variants
 2025        table_variants = self.get_table_variants("FROM")
 2026
 2027        # Get database format
 2028        connexion_format = self.get_connexion_format()
 2029
 2030        if access not in ["RO"]:
 2031            if connexion_format in ["duckdb"]:
 2032                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
 2033            elif connexion_format in ["sqlite"]:
 2034                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
 2035
 2036            list_indexes = self.conn.execute(sql_list_indexes)
 2037            index_names = [row[0] for row in list_indexes.fetchall()]
 2038            for index in index_names:
 2039                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
 2040                self.conn.execute(sql_drop_table_index)
 2041
 2042    def read_vcf_header(self, f) -> list:
 2043        """
 2044        It reads the header of a VCF file and returns a list of the header lines
 2045
 2046        :param f: the file object
 2047        :return: The header lines of the VCF file.
 2048        """
 2049
 2050        header_list = []
 2051        for line in f:
 2052            header_list.append(line)
 2053            if line.startswith("#CHROM"):
 2054                break
 2055        return header_list
 2056
 2057    def read_vcf_header_file(self, file: str = None) -> list:
 2058        """
 2059        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
 2060        uncompressed files.
 2061
 2062        :param file: The `file` parameter is a string that represents the path to the VCF header file
 2063        that you want to read. It is an optional parameter, so if you don't provide a value, it will
 2064        default to `None`
 2065        :type file: str
 2066        :return: The function `read_vcf_header_file` returns a list.
 2067        """
 2068
 2069        if self.get_input_compressed(input_file=file):
 2070            with bgzf.open(file, "rt") as f:
 2071                return self.read_vcf_header(f=f)
 2072        else:
 2073            with open(file, "rt") as f:
 2074                return self.read_vcf_header(f=f)
 2075
 2076    def execute_query(self, query: str):
 2077        """
 2078        It takes a query as an argument, executes it, and returns the results
 2079
 2080        :param query: The query to be executed
 2081        :return: The result of the query is being returned.
 2082        """
 2083        if query:
 2084            return self.conn.execute(query)  # .fetchall()
 2085        else:
 2086            return None
 2087
 2088    def export_output(
 2089        self,
 2090        output_file: str | None = None,
 2091        output_header: str | None = None,
 2092        export_header: bool = True,
 2093        query: str | None = None,
 2094        parquet_partitions: list | None = None,
 2095        chunk_size: int | None = None,
 2096        threads: int | None = None,
 2097        sort: bool = False,
 2098        index: bool = False,
 2099        order_by: str | None = None,
 2100        fields_to_rename: dict | None = None,
 2101    ) -> bool:
 2102        """
 2103        The `export_output` function exports data from a VCF file to various formats, including VCF,
 2104        CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and
 2105        partitioning.
 2106
 2107        :param output_file: The `output_file` parameter is a string that specifies the name of the
 2108        output file where the exported data will be saved
 2109        :type output_file: str | None
 2110        :param output_header: The `output_header` parameter is a string that specifies the name of the
 2111        file where the header of the VCF file will be exported. If this parameter is not provided, the
 2112        header will be exported to a file with the same name as the `output_file` parameter, but with
 2113        the extension "
 2114        :type output_header: str | None
 2115        :param export_header: The `export_header` parameter is a boolean flag that determines whether
 2116        the header of a VCF file should be exported to a separate file or not. If `export_header` is
 2117        True, the header will be exported to a file. If `export_header` is False, the header will not
 2118        be, defaults to True
 2119        :type export_header: bool (optional)
 2120        :param query: The `query` parameter in the `export_output` function is an optional SQL query
 2121        that can be used to filter and select specific data from the VCF file before exporting it. If
 2122        provided, only the data that matches the query will be exported. This allows you to customize
 2123        the exported data based on
 2124        :type query: str | None
 2125        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
 2126        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
 2127        organize data in a hierarchical directory structure based on the values of one or more columns.
 2128        This can improve query performance when working with large datasets
 2129        :type parquet_partitions: list | None
 2130        :param chunk_size: The `chunk_size` parameter specifies the number of records in a batch when
 2131        exporting data in Parquet format. This parameter is used for partitioning the Parquet file into
 2132        multiple files. It helps in optimizing the export process by breaking down the data into
 2133        manageable chunks for processing and storage
 2134        :type chunk_size: int | None
 2135        :param threads: The `threads` parameter in the `export_output` function specifies the number of
 2136        threads to be used during the export process. It determines the level of parallelism and can
 2137        improve the performance of the export operation. If this parameter is not provided, the function
 2138        will use the default number of threads
 2139        :type threads: int | None
 2140        :param sort: The `sort` parameter in the `export_output` function is a boolean flag that
 2141        determines whether the output file should be sorted based on genomic coordinates of the
 2142        variants. If `sort` is set to `True`, the output file will be sorted. If `sort` is set to
 2143        `False`,, defaults to False
 2144        :type sort: bool (optional)
 2145        :param index: The `index` parameter in the `export_output` function is a boolean flag that
 2146        determines whether an index should be created on the output file. If `index` is set to `True`,
 2147        an index will be created on the output file. If `index` is set to `False`, no, defaults to False
 2148        :type index: bool (optional)
 2149        :param order_by: The `order_by` parameter in the `export_output` function is a string that
 2150        specifies the column(s) to use for sorting the output file. This parameter is only applicable
 2151        when exporting data in VCF format. It allows you to specify the column(s) based on which the
 2152        output file should be
 2153        :type order_by: str | None
 2154        :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that specifies the
 2155        mapping of field names to be renamed during the export process. This parameter allows you to
 2156        customize the output field names before exporting the data. Each key-value pair in the
 2157        dictionary represents the original field name as the key and the new field name
 2158        :type fields_to_rename: dict | None
 2159        :return: The `export_output` function returns a boolean value. It checks if the output file
 2160        exists and returns True if it does, or None if it doesn't.
 2161        """
 2162
 2163        # Log
 2164        log.info("Exporting...")
 2165
 2166        # Full path
 2167        output_file = full_path(output_file)
 2168        output_header = full_path(output_header)
 2169
 2170        # Config
 2171        config = self.get_config()
 2172
 2173        # Param
 2174        param = self.get_param()
 2175
 2176        # Tmp files to remove
 2177        tmp_to_remove = []
 2178
 2179        # If no output, get it
 2180        if not output_file:
 2181            output_file = self.get_output()
 2182
 2183        # If not threads
 2184        if not threads:
 2185            threads = self.get_threads()
 2186
 2187        # Rename fields
 2188        if not fields_to_rename:
 2189            fields_to_rename = param.get("export", {}).get("fields_to_rename", None)
 2190        self.rename_info_fields(fields_to_rename=fields_to_rename)
 2191
 2192        # Auto header name with extension
 2193        if export_header or output_header:
 2194            if not output_header:
 2195                output_header = f"{output_file}.hdr"
 2196            # Export header
 2197            self.export_header(output_file=output_file)
 2198
 2199        # Switch off export header if VCF output
 2200        output_file_type = get_file_format(output_file)
 2201        if output_file_type in ["vcf"]:
 2202            export_header = False
 2203            tmp_to_remove.append(output_header)
 2204
 2205        # Chunk size
 2206        if not chunk_size:
 2207            chunk_size = config.get("chunk_size", None)
 2208
 2209        # Parquet partition
 2210        if not parquet_partitions:
 2211            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
 2212        if parquet_partitions and isinstance(parquet_partitions, str):
 2213            parquet_partitions = parquet_partitions.split(",")
 2214
 2215        # Order by
 2216        if not order_by:
 2217            order_by = param.get("export", {}).get("order_by", "")
 2218
 2219        # Header in output
 2220        header_in_output = param.get("export", {}).get("include_header", False)
 2221
 2222        # Database
 2223        database_source = self.get_connexion()
 2224
 2225        # Connexion format
 2226        connexion_format = self.get_connexion_format()
 2227
 2228        # Explode infos
 2229        if self.get_explode_infos():
 2230            self.explode_infos(
 2231                prefix=self.get_explode_infos_prefix(),
 2232                fields=self.get_explode_infos_fields(),
 2233                force=False,
 2234            )
 2235
 2236        # if connexion_format in ["sqlite"] or query:
 2237        if connexion_format in ["sqlite"]:
 2238
 2239            # Export in Parquet
 2240            random_tmp = "".join(
 2241                random.choice(string.ascii_lowercase) for i in range(10)
 2242            )
 2243            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
 2244            tmp_to_remove.append(database_source)
 2245
 2246            # Table Variants
 2247            table_variants = self.get_table_variants()
 2248
 2249            # Create export query
 2250            sql_query_export_subquery = f"""
 2251                SELECT * FROM {table_variants}
 2252                """
 2253
 2254            # Write source file
 2255            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
 2256
 2257        # Create database
 2258        database = Database(
 2259            database=database_source,
 2260            table="variants",
 2261            header_file=output_header,
 2262            conn_config=self.get_connexion_config(),
 2263        )
 2264
 2265        # Existing colomns header
 2266        existing_columns_header = database.get_header_columns_from_database(query=query)
 2267
 2268        # Sample list
 2269        if output_file_type in ["vcf"]:
 2270            get_samples = self.get_samples()
 2271            get_samples_check = self.get_samples_check()
 2272            samples_force = get_samples is not None
 2273            sample_list = self.get_header_sample_list(
 2274                check=get_samples_check,
 2275                samples=get_samples,
 2276                samples_force=samples_force,
 2277            )
 2278        else:
 2279            sample_list = None
 2280
 2281        # Export file
 2282        database.export(
 2283            output_database=output_file,
 2284            output_header=output_header,
 2285            existing_columns_header=existing_columns_header,
 2286            parquet_partitions=parquet_partitions,
 2287            chunk_size=chunk_size,
 2288            threads=threads,
 2289            sort=sort,
 2290            index=index,
 2291            header_in_output=header_in_output,
 2292            order_by=order_by,
 2293            query=query,
 2294            export_header=export_header,
 2295            sample_list=sample_list,
 2296        )
 2297
 2298        # Remove
 2299        remove_if_exists(tmp_to_remove)
 2300
 2301        return (os.path.exists(output_file) or None) and (
 2302            os.path.exists(output_file) or None
 2303        )
 2304
 2305    def get_extra_infos(self, table: str = None) -> list:
 2306        """
 2307        The `get_extra_infos` function returns a list of columns that are in a specified table but not
 2308        in the header.
 2309
 2310        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
 2311        name of the table from which you want to retrieve the extra columns that are not present in the
 2312        header. If the `table` parameter is not provided when calling the function, it will default to
 2313        using the variants
 2314        :type table: str
 2315        :return: A list of columns that are in the specified table but not in the header of the table.
 2316        """
 2317
 2318        header_columns = []
 2319
 2320        if not table:
 2321            table = self.get_table_variants(clause="from")
 2322            header_columns = self.get_header_columns()
 2323
 2324        # Check all columns in the database
 2325        query = f""" SELECT * FROM {table} LIMIT 1 """
 2326        log.debug(f"query {query}")
 2327        table_columns = self.get_query_to_df(query).columns.tolist()
 2328        extra_columns = []
 2329
 2330        # Construct extra infos (not in header)
 2331        for column in table_columns:
 2332            if column not in header_columns:
 2333                extra_columns.append(column)
 2334
 2335        return extra_columns
 2336
 2337    def get_extra_infos_sql(self, table: str = None) -> str:
 2338        """
 2339        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
 2340        by double quotes
 2341
 2342        :param table: The name of the table to get the extra infos from. If None, the default table is
 2343        used
 2344        :type table: str
 2345        :return: A string of the extra infos
 2346        """
 2347
 2348        return ", ".join(
 2349            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
 2350        )
 2351
 2352    def export_header(
 2353        self,
 2354        header_name: str = None,
 2355        output_file: str = None,
 2356        output_file_ext: str = ".hdr",
 2357        clean_header: bool = True,
 2358        remove_chrom_line: bool = False,
 2359    ) -> str:
 2360        """
 2361        The `export_header` function takes a VCF file, extracts the header, modifies it according to
 2362        specified options, and writes it to a new file.
 2363
 2364        :param header_name: The `header_name` parameter is the name of the header file to be created. If
 2365        this parameter is not specified, the header will be written to the output file
 2366        :type header_name: str
 2367        :param output_file: The `output_file` parameter in the `export_header` function is used to
 2368        specify the name of the output file where the header will be written. If this parameter is not
 2369        provided, the header will be written to a temporary file
 2370        :type output_file: str
 2371        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
 2372        string that represents the extension of the output header file. By default, it is set to ".hdr"
 2373        if not specified by the user. This extension will be appended to the `output_file` name to
 2374        create the final, defaults to .hdr
 2375        :type output_file_ext: str (optional)
 2376        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
 2377        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
 2378        `True`, the function will clean the header by modifying certain lines based on a specific
 2379        pattern. If `clean_header`, defaults to True
 2380        :type clean_header: bool (optional)
 2381        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
 2382        boolean flag that determines whether the #CHROM line should be removed from the header before
 2383        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
 2384        defaults to False
 2385        :type remove_chrom_line: bool (optional)
 2386        :return: The function `export_header` returns the name of the temporary header file that is
 2387        created.
 2388        """
 2389
 2390        if not header_name and not output_file:
 2391            output_file = self.get_output()
 2392
 2393        if self.get_header():
 2394
 2395            # Get header object
 2396            header_obj = self.get_header()
 2397
 2398            # Create database
 2399            db_for_header = Database(database=self.get_input())
 2400
 2401            # Get real columns in the file
 2402            db_header_columns = db_for_header.get_columns()
 2403
 2404            with tempfile.TemporaryDirectory() as tmpdir:
 2405
 2406                # Write header file
 2407                header_file_tmp = os.path.join(tmpdir, "header")
 2408                f = open(header_file_tmp, "w")
 2409                vcf.Writer(f, header_obj)
 2410                f.close()
 2411
 2412                # Replace #CHROM line with rel columns
 2413                header_list = db_for_header.read_header_file(
 2414                    header_file=header_file_tmp
 2415                )
 2416                header_list[-1] = "\t".join(db_header_columns)
 2417
 2418                # Remove CHROM line
 2419                if remove_chrom_line:
 2420                    header_list.pop()
 2421
 2422                # Clean header
 2423                if clean_header:
 2424                    header_list_clean = []
 2425                    for head in header_list:
 2426                        # Clean head for malformed header
 2427                        head_clean = head
 2428                        head_clean = re.subn(
 2429                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
 2430                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
 2431                            head_clean,
 2432                            2,
 2433                        )[0]
 2434                        # Write header
 2435                        header_list_clean.append(head_clean)
 2436                    header_list = header_list_clean
 2437
 2438            tmp_header_name = output_file + output_file_ext
 2439
 2440            f = open(tmp_header_name, "w")
 2441            for line in header_list:
 2442                f.write(line)
 2443            f.close()
 2444
 2445        return tmp_header_name
 2446
 2447    def export_variant_vcf(
 2448        self,
 2449        vcf_file,
 2450        remove_info: bool = False,
 2451        add_samples: bool = True,
 2452        list_samples: list = [],
 2453        where_clause: str = "",
 2454        index: bool = False,
 2455        threads: int | None = None,
 2456    ) -> bool | None:
 2457        """
 2458        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
 2459        remove INFO field, add samples, and control compression and indexing.
 2460
 2461        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
 2462        written to. It is the output file that will contain the filtered VCF data based on the specified
 2463        parameters
 2464        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
 2465        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
 2466        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
 2467        in, defaults to False
 2468        :type remove_info: bool (optional)
 2469        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
 2470        the samples should be added to the VCF file or not. If set to True, the samples will be added.
 2471        If set to False, the samples will be removed. The default value is True, defaults to True
 2472        :type add_samples: bool (optional)
 2473        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
 2474        in the output VCF file. By default, all samples will be included. If you provide a list of
 2475        samples, only those samples will be included in the output file
 2476        :type list_samples: list
 2477        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
 2478        determines whether or not to create an index for the output VCF file. If `index` is set to
 2479        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
 2480        :type index: bool (optional)
 2481        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
 2482        number of threads to use for exporting the VCF file. It determines how many parallel threads
 2483        will be used during the export process. More threads can potentially speed up the export process
 2484        by utilizing multiple cores of the processor. If
 2485        :type threads: int | None
 2486        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
 2487        method with various parameters including the output file, query, threads, sort flag, and index
 2488        flag. The `export_output` method is responsible for exporting the VCF data based on the
 2489        specified parameters and configurations provided in the `export_variant_vcf` function.
 2490        """
 2491
 2492        # Config
 2493        config = self.get_config()
 2494
 2495        # Extract VCF
 2496        log.debug("Export VCF...")
 2497
 2498        # Table variants
 2499        table_variants = self.get_table_variants()
 2500
 2501        # Threads
 2502        if not threads:
 2503            threads = self.get_threads()
 2504
 2505        # Info fields
 2506        if remove_info:
 2507            if not isinstance(remove_info, str):
 2508                remove_info = "."
 2509            info_field = f"""'{remove_info}' as INFO"""
 2510        else:
 2511            info_field = "INFO"
 2512
 2513        # Samples fields
 2514        if add_samples:
 2515            if not list_samples:
 2516                list_samples = self.get_header_sample_list()
 2517            if list_samples:
 2518                samples_fields = " , FORMAT , " + " , ".join(
 2519                    [f""" "{sample}" """ for sample in list_samples]
 2520                )
 2521            else:
 2522                samples_fields = ""
 2523            log.debug(f"samples_fields: {samples_fields}")
 2524        else:
 2525            samples_fields = ""
 2526
 2527        # Where clause
 2528        if where_clause is None:
 2529            where_clause = ""
 2530
 2531        # Variants
 2532        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
 2533        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
 2534        log.debug(f"sql_query_select={sql_query_select}")
 2535
 2536        return self.export_output(
 2537            output_file=vcf_file,
 2538            output_header=None,
 2539            export_header=True,
 2540            query=sql_query_select,
 2541            parquet_partitions=None,
 2542            chunk_size=config.get("chunk_size", None),
 2543            threads=threads,
 2544            sort=True,
 2545            index=index,
 2546            order_by=None,
 2547        )
 2548
    def run_commands(self, commands: list = [], threads: int = 1) -> None:
        """
        Run a list of shell commands in parallel using the given number of
        threads (thin wrapper around `run_parallel_commands`).

        NOTE(review): `commands` uses a mutable default argument; harmless
        here since the list is only passed through, never mutated.

        :param commands: list of commands to run
        :param threads: number of parallel threads to use, defaults to 1 (optional)
        """

        run_parallel_commands(commands, threads)
 2558
 2559    def get_threads(self, default: int = 1) -> int:
 2560        """
 2561        This function returns the number of threads to use for a job, with a default value of 1 if not
 2562        specified.
 2563
 2564        :param default: The `default` parameter in the `get_threads` method is used to specify the
 2565        default number of threads to use if no specific value is provided. If no value is provided for
 2566        the `threads` parameter in the configuration or input parameters, the `default` value will be
 2567        used, defaults to 1
 2568        :type default: int (optional)
 2569        :return: the number of threads to use for the current job.
 2570        """
 2571
 2572        # Config
 2573        config = self.get_config()
 2574
 2575        # Param
 2576        param = self.get_param()
 2577
 2578        # Input threads
 2579        input_thread = param.get("threads", config.get("threads", None))
 2580
 2581        # Check threads
 2582        if not input_thread:
 2583            threads = default
 2584        elif int(input_thread) <= 0:
 2585            threads = os.cpu_count()
 2586        else:
 2587            threads = int(input_thread)
 2588        return threads
 2589
 2590    def get_memory(self, default: str = None) -> str:
 2591        """
 2592        This function retrieves the memory value from parameters or configuration with a default value
 2593        if not found.
 2594
 2595        :param default: The `get_memory` function takes in a default value as a string parameter. This
 2596        default value is used as a fallback in case the `memory` parameter is not provided in the
 2597        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
 2598        the function
 2599        :type default: str
 2600        :return: The `get_memory` function returns a string value representing the memory parameter. If
 2601        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
 2602        return the default value provided as an argument to the function.
 2603        """
 2604
 2605        # Config
 2606        config = self.get_config()
 2607
 2608        # Param
 2609        param = self.get_param()
 2610
 2611        # Input threads
 2612        input_memory = param.get("memory", config.get("memory", None))
 2613
 2614        # Check threads
 2615        if input_memory:
 2616            memory = input_memory
 2617        else:
 2618            memory = default
 2619
 2620        return memory
 2621
 2622    def update_from_vcf(self, vcf_file: str) -> None:
 2623        """
 2624        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
 2625
 2626        :param vcf_file: the path to the VCF file
 2627        """
 2628
 2629        connexion_format = self.get_connexion_format()
 2630
 2631        if connexion_format in ["duckdb"]:
 2632            self.update_from_vcf_duckdb(vcf_file)
 2633        elif connexion_format in ["sqlite"]:
 2634            self.update_from_vcf_sqlite(vcf_file)
 2635
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table from a VCF file (DuckDB
        backend).

        Variants are matched on #CHROM, POS, REF and ALT; the VCF's INFO
        value is appended to the existing INFO, separated by ';' when both
        are non-empty ('' and '.' count as empty).

        :param vcf_file: path to the VCF file
        """

        # Variants table name
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame, skipping the header lines
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # NOTE: `vcf_df` is referenced by name inside the SQL below — DuckDB
        # resolves in-scope pandas DataFrames as tables (replacement scan),
        # so the variable is used even though no Python code reads it.
        # DuckDB's concat() treats NULL as '', so variants without a match in
        # the VCF keep their INFO unchanged.
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)
 2691
 2692    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
 2693        """
 2694        It creates a temporary table in the SQLite database, loads the VCF file into the temporary
 2695        table, then updates the INFO column of the variants table with the INFO column of the temporary
 2696        table
 2697
 2698        :param vcf_file: The path to the VCF file you want to update the database with
 2699        """
 2700
 2701        # Create a temporary table for the VCF
 2702        table_vcf = "tmp_vcf"
 2703        sql_create = (
 2704            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
 2705        )
 2706        self.conn.execute(sql_create)
 2707
 2708        # Loading VCF into temporaire table
 2709        vcf_df = pd.read_csv(
 2710            vcf_file, sep="\t", comment="#", header=None, low_memory=False
 2711        )
 2712        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
 2713        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)
 2714
 2715        # Update table 'variants' with VCF data
 2716        # warning: CONCAT as || operator
 2717        sql_query_update = f"""
 2718            UPDATE variants as table_variants
 2719            SET INFO = CASE
 2720                            WHEN INFO NOT IN ('', '.')
 2721                            THEN INFO
 2722                            ELSE ''
 2723                        END ||
 2724                        (
 2725                        SELECT 
 2726                            CASE 
 2727                                WHEN table_variants.INFO NOT IN ('','.') 
 2728                                    AND table_vcf.INFO NOT IN ('','.')  
 2729                                THEN ';' 
 2730                                ELSE '' 
 2731                            END || 
 2732                            CASE 
 2733                                WHEN table_vcf.INFO NOT IN ('','.') 
 2734                                THEN table_vcf.INFO 
 2735                                ELSE '' 
 2736                            END
 2737                        FROM {table_vcf} as table_vcf
 2738                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
 2739                            AND table_vcf.\"POS\" = table_variants.\"POS\"
 2740                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
 2741                            AND table_vcf.\"REF\" = table_variants.\"REF\"
 2742                        )
 2743        """
 2744        self.conn.execute(sql_query_update)
 2745
 2746        # Drop temporary table
 2747        sql_drop = f"DROP TABLE {table_vcf}"
 2748        self.conn.execute(sql_drop)
 2749
 2750    def drop_variants_table(self) -> None:
 2751        """
 2752        > This function drops the variants table
 2753        """
 2754
 2755        table_variants = self.get_table_variants()
 2756        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
 2757        self.conn.execute(sql_table_variants)
 2758
 2759    def set_variant_id(
 2760        self, variant_id_column: str = "variant_id", force: bool = None
 2761    ) -> str:
 2762        """
 2763        It adds a column to the variants table called `variant_id` and populates it with a hash of the
 2764        `#CHROM`, `POS`, `REF`, and `ALT` columns
 2765
 2766        :param variant_id_column: The name of the column to be created in the variants table, defaults
 2767        to variant_id
 2768        :type variant_id_column: str (optional)
 2769        :param force: If True, the variant_id column will be created even if it already exists
 2770        :type force: bool
 2771        :return: The name of the column that contains the variant_id
 2772        """
 2773
 2774        # Assembly
 2775        assembly = self.get_param().get(
 2776            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 2777        )
 2778
 2779        # INFO/Tag prefix
 2780        prefix = self.get_explode_infos_prefix()
 2781
 2782        # Explode INFO/SVTYPE
 2783        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])
 2784
 2785        # variants table
 2786        table_variants = self.get_table_variants()
 2787
 2788        # variant_id column
 2789        if not variant_id_column:
 2790            variant_id_column = "variant_id"
 2791
 2792        # Creta variant_id column
 2793        if "variant_id" not in self.get_extra_infos() or force:
 2794
 2795            # Create column
 2796            self.add_column(
 2797                table_name=table_variants,
 2798                column_name=variant_id_column,
 2799                column_type="UBIGINT",
 2800                default_value="0",
 2801            )
 2802
 2803            # Update column
 2804            self.conn.execute(
 2805                f"""
 2806                    UPDATE {table_variants}
 2807                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
 2808                """
 2809            )
 2810
 2811        # Remove added columns
 2812        for added_column in added_columns:
 2813            self.drop_column(column=added_column)
 2814
 2815        # return variant_id column name
 2816        return variant_id_column
 2817
    def get_variant_id_column(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Return the name of the variant-id column.

        Thin wrapper: delegates entirely to `set_variant_id`, which may
        create and populate the column on the variants table as a side
        effect.

        :param variant_id_column: name of the variant-id column, defaults to
        variant_id
        :type variant_id_column: str (optional)
        :param force: if True, the column is (re)computed even if it already
        exists; see `set_variant_id`
        :type force: bool
        :return: the variant_id column name
        """

        return self.set_variant_id(variant_id_column=variant_id_column, force=force)
 2835
 2836    ###
 2837    # Annotation
 2838    ###
 2839
 2840    def scan_databases(
 2841        self,
 2842        database_formats: list = ["parquet"],
 2843        database_releases: list = ["current"],
 2844    ) -> dict:
 2845        """
 2846        The function `scan_databases` scans for available databases based on specified formats and
 2847        releases.
 2848
 2849        :param database_formats: The `database_formats` parameter is a list that specifies the formats
 2850        of the databases to be scanned. In this case, the accepted format is "parquet"
 2851        :type database_formats: list ["parquet"]
 2852        :param database_releases: The `database_releases` parameter is a list that specifies the
 2853        releases of the databases to be scanned. In the provided function, the default value for
 2854        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
 2855        databases that are in the "current"
 2856        :type database_releases: list
 2857        :return: The function `scan_databases` returns a dictionary containing information about
 2858        databases that match the specified formats and releases.
 2859        """
 2860
 2861        # Config
 2862        config = self.get_config()
 2863
 2864        # Param
 2865        param = self.get_param()
 2866
 2867        # Param - Assembly
 2868        assembly = param.get("assembly", config.get("assembly", None))
 2869        if not assembly:
 2870            assembly = DEFAULT_ASSEMBLY
 2871            log.warning(f"Default assembly '{assembly}'")
 2872
 2873        # Scan for availabled databases
 2874        log.info(
 2875            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
 2876        )
 2877        databases_infos_dict = databases_infos(
 2878            database_folder_releases=database_releases,
 2879            database_formats=database_formats,
 2880            assembly=assembly,
 2881            config=config,
 2882        )
 2883        log.info(
 2884            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
 2885        )
 2886
 2887        return databases_infos_dict
 2888
 2889    def annotation(self) -> None:
 2890        """
 2891        It annotates the VCF file with the annotations specified in the config file.
 2892        """
 2893
 2894        # Config
 2895        config = self.get_config()
 2896
 2897        # Param
 2898        param = self.get_param()
 2899
 2900        # Param - Assembly
 2901        assembly = param.get("assembly", config.get("assembly", None))
 2902        if not assembly:
 2903            assembly = DEFAULT_ASSEMBLY
 2904            log.warning(f"Default assembly '{assembly}'")
 2905
 2906        # annotations databases folders
 2907        annotations_databases = set(
 2908            config.get("folders", {})
 2909            .get("databases", {})
 2910            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
 2911            + config.get("folders", {})
 2912            .get("databases", {})
 2913            .get("parquet", ["~/howard/databases/parquet/current"])
 2914            + config.get("folders", {})
 2915            .get("databases", {})
 2916            .get("bcftools", ["~/howard/databases/bcftools/current"])
 2917        )
 2918
 2919        # Get param annotations
 2920        if param.get("annotations", None) and isinstance(
 2921            param.get("annotations", None), str
 2922        ):
 2923            log.debug(param.get("annotations", None))
 2924            param_annotation_list = param.get("annotations").split(",")
 2925        else:
 2926            param_annotation_list = []
 2927
 2928        # Each tools param
 2929        if param.get("annotation_parquet", None) != None:
 2930            log.debug(
 2931                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
 2932            )
 2933            if isinstance(param.get("annotation_parquet", None), list):
 2934                param_annotation_list.append(",".join(param.get("annotation_parquet")))
 2935            else:
 2936                param_annotation_list.append(param.get("annotation_parquet"))
 2937        if param.get("annotation_snpsift", None) != None:
 2938            if isinstance(param.get("annotation_snpsift", None), list):
 2939                param_annotation_list.append(
 2940                    "snpsift:"
 2941                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
 2942                )
 2943            else:
 2944                param_annotation_list.append(
 2945                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
 2946                )
 2947        if param.get("annotation_snpeff", None) != None:
 2948            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
 2949        if param.get("annotation_bcftools", None) != None:
 2950            if isinstance(param.get("annotation_bcftools", None), list):
 2951                param_annotation_list.append(
 2952                    "bcftools:"
 2953                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
 2954                )
 2955            else:
 2956                param_annotation_list.append(
 2957                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
 2958                )
 2959        if param.get("annotation_annovar", None) != None:
 2960            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
 2961        if param.get("annotation_exomiser", None) != None:
 2962            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
 2963        if param.get("annotation_splice", None) != None:
 2964            param_annotation_list.append("splice:" + param.get("annotation_splice"))
 2965
 2966        # Merge param annotations list
 2967        param["annotations"] = ",".join(param_annotation_list)
 2968
 2969        # debug
 2970        log.debug(f"param_annotations={param['annotations']}")
 2971
 2972        if param.get("annotations"):
 2973
 2974            # Log
 2975            # log.info("Annotations - Check annotation parameters")
 2976
 2977            if not "annotation" in param:
 2978                param["annotation"] = {}
 2979
 2980            # List of annotations parameters
 2981            annotations_list_input = {}
 2982            if isinstance(param.get("annotations", None), str):
 2983                annotation_file_list = [
 2984                    value for value in param.get("annotations", "").split(",")
 2985                ]
 2986                for annotation_file in annotation_file_list:
 2987                    annotations_list_input[annotation_file.strip()] = {"INFO": None}
 2988            else:
 2989                annotations_list_input = param.get("annotations", {})
 2990
 2991            log.info(f"Quick Annotations:")
 2992            for annotation_key in list(annotations_list_input.keys()):
 2993                log.info(f"   {annotation_key}")
 2994
 2995            # List of annotations and associated fields
 2996            annotations_list = {}
 2997
 2998            for annotation_file in annotations_list_input:
 2999
 3000                # Explode annotations if ALL
 3001                if (
 3002                    annotation_file.upper() == "ALL"
 3003                    or annotation_file.upper().startswith("ALL:")
 3004                ):
 3005
 3006                    # check ALL parameters (formats, releases)
 3007                    annotation_file_split = annotation_file.split(":")
 3008                    database_formats = "parquet"
 3009                    database_releases = "current"
 3010                    for annotation_file_option in annotation_file_split[1:]:
 3011                        database_all_options_split = annotation_file_option.split("=")
 3012                        if database_all_options_split[0] == "format":
 3013                            database_formats = database_all_options_split[1].split("+")
 3014                        if database_all_options_split[0] == "release":
 3015                            database_releases = database_all_options_split[1].split("+")
 3016
 3017                    # Scan for availabled databases
 3018                    databases_infos_dict = self.scan_databases(
 3019                        database_formats=database_formats,
 3020                        database_releases=database_releases,
 3021                    )
 3022
 3023                    # Add found databases in annotation parameters
 3024                    for database_infos in databases_infos_dict.keys():
 3025                        annotations_list[database_infos] = {"INFO": None}
 3026
 3027                else:
 3028                    annotations_list[annotation_file] = annotations_list_input[
 3029                        annotation_file
 3030                    ]
 3031
 3032            # Check each databases
 3033            if len(annotations_list):
 3034
 3035                log.info(
 3036                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
 3037                )
 3038
 3039                for annotation_file in annotations_list:
 3040
 3041                    # Init
 3042                    annotations = annotations_list.get(annotation_file, None)
 3043
 3044                    # Annotation snpEff
 3045                    if annotation_file.startswith("snpeff"):
 3046
 3047                        log.debug(f"Quick Annotation snpEff")
 3048
 3049                        if "snpeff" not in param["annotation"]:
 3050                            param["annotation"]["snpeff"] = {}
 3051
 3052                        if "options" not in param["annotation"]["snpeff"]:
 3053                            param["annotation"]["snpeff"]["options"] = ""
 3054
 3055                        # snpEff options in annotations
 3056                        param["annotation"]["snpeff"]["options"] = "".join(
 3057                            annotation_file.split(":")[1:]
 3058                        )
 3059
 3060                    # Annotation Annovar
 3061                    elif annotation_file.startswith("annovar"):
 3062
 3063                        log.debug(f"Quick Annotation Annovar")
 3064
 3065                        if "annovar" not in param["annotation"]:
 3066                            param["annotation"]["annovar"] = {}
 3067
 3068                        if "annotations" not in param["annotation"]["annovar"]:
 3069                            param["annotation"]["annovar"]["annotations"] = {}
 3070
 3071                        # Options
 3072                        annotation_file_split = annotation_file.split(":")
 3073                        for annotation_file_annotation in annotation_file_split[1:]:
 3074                            if annotation_file_annotation:
 3075                                param["annotation"]["annovar"]["annotations"][
 3076                                    annotation_file_annotation
 3077                                ] = annotations
 3078
 3079                    # Annotation Exomiser
 3080                    elif annotation_file.startswith("exomiser"):
 3081
 3082                        log.debug(f"Quick Annotation Exomiser")
 3083
 3084                        param["annotation"]["exomiser"] = params_string_to_dict(
 3085                            annotation_file
 3086                        )
 3087
 3088                    # Annotation Splice
 3089                    elif annotation_file.startswith("splice"):
 3090
 3091                        log.debug(f"Quick Annotation Splice")
 3092
 3093                        param["annotation"]["splice"] = params_string_to_dict(
 3094                            annotation_file
 3095                        )
 3096
 3097                    # Annotation Parquet or BCFTOOLS
 3098                    else:
 3099
 3100                        # Tools detection
 3101                        if annotation_file.startswith("bcftools:"):
 3102                            annotation_tool_initial = "bcftools"
 3103                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3104                        elif annotation_file.startswith("snpsift:"):
 3105                            annotation_tool_initial = "snpsift"
 3106                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3107                        elif annotation_file.startswith("bigwig:"):
 3108                            annotation_tool_initial = "bigwig"
 3109                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3110                        else:
 3111                            annotation_tool_initial = None
 3112
 3113                        # list of files
 3114                        annotation_file_list = annotation_file.replace("+", ":").split(
 3115                            ":"
 3116                        )
 3117
 3118                        for annotation_file in annotation_file_list:
 3119
 3120                            if annotation_file:
 3121
 3122                                # Annotation tool initial
 3123                                annotation_tool = annotation_tool_initial
 3124
 3125                                # Find file
 3126                                annotation_file_found = None
 3127
 3128                                if os.path.exists(annotation_file):
 3129                                    annotation_file_found = annotation_file
 3130                                elif os.path.exists(full_path(annotation_file)):
 3131                                    annotation_file_found = full_path(annotation_file)
 3132                                else:
 3133                                    # Find within assembly folders
 3134                                    for annotations_database in annotations_databases:
 3135                                        found_files = find_all(
 3136                                            annotation_file,
 3137                                            os.path.join(
 3138                                                annotations_database, assembly
 3139                                            ),
 3140                                        )
 3141                                        if len(found_files) > 0:
 3142                                            annotation_file_found = found_files[0]
 3143                                            break
 3144                                    if not annotation_file_found and not assembly:
 3145                                        # Find within folders
 3146                                        for (
 3147                                            annotations_database
 3148                                        ) in annotations_databases:
 3149                                            found_files = find_all(
 3150                                                annotation_file, annotations_database
 3151                                            )
 3152                                            if len(found_files) > 0:
 3153                                                annotation_file_found = found_files[0]
 3154                                                break
 3155                                log.debug(
 3156                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
 3157                                )
 3158
 3159                                # Full path
 3160                                annotation_file_found = full_path(annotation_file_found)
 3161
 3162                                if annotation_file_found:
 3163
 3164                                    database = Database(database=annotation_file_found)
 3165                                    quick_annotation_format = database.get_format()
 3166                                    quick_annotation_is_compressed = (
 3167                                        database.is_compressed()
 3168                                    )
 3169                                    quick_annotation_is_indexed = os.path.exists(
 3170                                        f"{annotation_file_found}.tbi"
 3171                                    )
 3172                                    bcftools_preference = False
 3173
 3174                                    # Check Annotation Tool
 3175                                    if not annotation_tool:
 3176                                        if (
 3177                                            bcftools_preference
 3178                                            and quick_annotation_format
 3179                                            in ["vcf", "bed"]
 3180                                            and quick_annotation_is_compressed
 3181                                            and quick_annotation_is_indexed
 3182                                        ):
 3183                                            annotation_tool = "bcftools"
 3184                                        elif quick_annotation_format in [
 3185                                            "vcf",
 3186                                            "bed",
 3187                                            "tsv",
 3188                                            "tsv",
 3189                                            "csv",
 3190                                            "json",
 3191                                            "tbl",
 3192                                            "parquet",
 3193                                            "duckdb",
 3194                                        ]:
 3195                                            annotation_tool = "parquet"
 3196                                        elif quick_annotation_format in ["bw"]:
 3197                                            annotation_tool = "bigwig"
 3198                                        else:
 3199                                            log.error(
 3200                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3201                                            )
 3202                                            raise ValueError(
 3203                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3204                                            )
 3205
 3206                                    log.debug(
 3207                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
 3208                                    )
 3209
 3210                                    # Annotation Tool dispatch
 3211                                    if annotation_tool:
 3212                                        if annotation_tool not in param["annotation"]:
 3213                                            param["annotation"][annotation_tool] = {}
 3214                                        if (
 3215                                            "annotations"
 3216                                            not in param["annotation"][annotation_tool]
 3217                                        ):
 3218                                            param["annotation"][annotation_tool][
 3219                                                "annotations"
 3220                                            ] = {}
 3221                                        param["annotation"][annotation_tool][
 3222                                            "annotations"
 3223                                        ][annotation_file_found] = annotations
 3224
 3225                                else:
 3226                                    log.warning(
 3227                                        f"Quick Annotation File {annotation_file} does NOT exist"
 3228                                    )
 3229
 3230                self.set_param(param)
 3231
 3232        if param.get("annotation", None):
 3233            log.info("Annotations")
 3234            if param.get("annotation", {}).get("parquet", None):
 3235                log.info("Annotations 'parquet'...")
 3236                self.annotation_parquet()
 3237            if param.get("annotation", {}).get("bcftools", None):
 3238                log.info("Annotations 'bcftools'...")
 3239                self.annotation_bcftools()
 3240            if param.get("annotation", {}).get("snpsift", None):
 3241                log.info("Annotations 'snpsift'...")
 3242                self.annotation_snpsift()
 3243            if param.get("annotation", {}).get("bigwig", None):
 3244                log.info("Annotations 'bigwig'...")
 3245                self.annotation_bigwig()
 3246            if param.get("annotation", {}).get("annovar", None):
 3247                log.info("Annotations 'annovar'...")
 3248                self.annotation_annovar()
 3249            if param.get("annotation", {}).get("snpeff", None):
 3250                log.info("Annotations 'snpeff'...")
 3251                self.annotation_snpeff()
 3252            if param.get("annotation", {}).get("exomiser", None) is not None:
 3253                log.info("Annotations 'exomiser'...")
 3254                self.annotation_exomiser()
 3255            if param.get("annotation", {}).get("splice", None) is not None:
 3256                log.info("Annotations 'splice' ...")
 3257                self.annotation_splice()
 3258
 3259        # Explode INFOS fields into table fields
 3260        if self.get_explode_infos():
 3261            self.explode_infos(
 3262                prefix=self.get_explode_infos_prefix(),
 3263                fields=self.get_explode_infos_fields(),
 3264                force=True,
 3265            )
 3266
 3267    def annotation_bigwig(self, threads: int = None) -> None:
 3268        """
 3269        The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases.
 3270
 3271        :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the
 3272        number of threads to be used for parallel processing during the annotation process. If the
 3273        `threads` parameter is not provided, the method will attempt to determine the optimal number of
 3274        threads to use based on the system configuration
 3275        :type threads: int
 3276        :return: True
 3277        """
 3278
 3279        # DEBUG
 3280        log.debug("Start annotation with bigwig databases")
 3281
 3282        # # Threads
 3283        # if not threads:
 3284        #     threads = self.get_threads()
 3285        # log.debug("Threads: " + str(threads))
 3286
 3287        # Config
 3288        config = self.get_config()
 3289        log.debug("Config: " + str(config))
 3290
 3291        # Config - BCFTools databases folders
 3292        databases_folders = set(
 3293            self.get_config()
 3294            .get("folders", {})
 3295            .get("databases", {})
 3296            .get("annotations", ["."])
 3297            + self.get_config()
 3298            .get("folders", {})
 3299            .get("databases", {})
 3300            .get("bigwig", ["."])
 3301        )
 3302        log.debug("Databases annotations: " + str(databases_folders))
 3303
 3304        # Param
 3305        annotations = (
 3306            self.get_param()
 3307            .get("annotation", {})
 3308            .get("bigwig", {})
 3309            .get("annotations", None)
 3310        )
 3311        log.debug("Annotations: " + str(annotations))
 3312
 3313        # Assembly
 3314        assembly = self.get_param().get(
 3315            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3316        )
 3317
 3318        # Data
 3319        table_variants = self.get_table_variants()
 3320
 3321        # Check if not empty
 3322        log.debug("Check if not empty")
 3323        sql_query_chromosomes = (
 3324            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3325        )
 3326        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3327        if not sql_query_chromosomes_df["count"][0]:
 3328            log.info(f"VCF empty")
 3329            return
 3330
 3331        # VCF header
 3332        vcf_reader = self.get_header()
 3333        log.debug("Initial header: " + str(vcf_reader.infos))
 3334
 3335        # Existing annotations
 3336        for vcf_annotation in self.get_header().infos:
 3337
 3338            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3339            log.debug(
 3340                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3341            )
 3342
 3343        if annotations:
 3344
 3345            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3346
 3347                # Export VCF file
 3348                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3349
 3350                # annotation_bigwig_config
 3351                annotation_bigwig_config_list = []
 3352
 3353                for annotation in annotations:
 3354                    annotation_fields = annotations[annotation]
 3355
 3356                    # Annotation Name
 3357                    annotation_name = os.path.basename(annotation)
 3358
 3359                    if not annotation_fields:
 3360                        annotation_fields = {"INFO": None}
 3361
 3362                    log.debug(f"Annotation '{annotation_name}'")
 3363                    log.debug(
 3364                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3365                    )
 3366
 3367                    # Create Database
 3368                    database = Database(
 3369                        database=annotation,
 3370                        databases_folders=databases_folders,
 3371                        assembly=assembly,
 3372                    )
 3373
 3374                    # Find files
 3375                    db_file = database.get_database()
 3376                    db_file = full_path(db_file)
 3377                    db_hdr_file = database.get_header_file()
 3378                    db_hdr_file = full_path(db_hdr_file)
 3379                    db_file_type = database.get_format()
 3380
 3381                    # If db_file is http ?
 3382                    if database.get_database().startswith("http"):
 3383
 3384                        # Datbase is HTTP URL
 3385                        db_file_is_http = True
 3386
 3387                        # DB file keep as URL
 3388                        db_file = database.get_database()
 3389                        log.warning(
 3390                            f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)"
 3391                        )
 3392
 3393                        # Retrieve automatic annotation field name
 3394                        annotation_field = clean_annotation_field(
 3395                            os.path.basename(db_file).replace(".bw", "")
 3396                        )
 3397                        log.debug(
 3398                            f"Create header file with annotation field '{annotation_field}' is an HTTP URL"
 3399                        )
 3400
 3401                        # Create automatic header file
 3402                        db_hdr_file = os.path.join(tmp_dir, "header.hdr")
 3403                        with open(db_hdr_file, "w") as f:
 3404                            f.write("##fileformat=VCFv4.2\n")
 3405                            f.write(
 3406                                f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n"""
 3407                            )
 3408                            f.write(f"#CHROM	START	END	{annotation_field}\n")
 3409
 3410                    else:
 3411
 3412                        # Datbase is NOT HTTP URL
 3413                        db_file_is_http = False
 3414
 3415                    # Check index - try to create if not exists
 3416                    if (
 3417                        db_file is None
 3418                        or db_hdr_file is None
 3419                        or (not os.path.exists(db_file) and not db_file_is_http)
 3420                        or not os.path.exists(db_hdr_file)
 3421                        or not db_file_type in ["bw"]
 3422                    ):
 3423                        # if False:
 3424                        log.error("Annotation failed: database not valid")
 3425                        log.error(f"Annotation annotation file: {db_file}")
 3426                        log.error(f"Annotation annotation file type: {db_file_type}")
 3427                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3428                        raise ValueError(
 3429                            f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}"
 3430                        )
 3431                    else:
 3432
 3433                        # Log
 3434                        log.debug(
 3435                            f"Annotation '{annotation}' - file: "
 3436                            + str(db_file)
 3437                            + " and "
 3438                            + str(db_hdr_file)
 3439                        )
 3440
 3441                        # Load header as VCF object
 3442                        db_hdr_vcf = Variants(input=db_hdr_file)
 3443                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3444                        log.debug(
 3445                            "Annotation database header: "
 3446                            + str(db_hdr_vcf_header_infos)
 3447                        )
 3448
 3449                        # For all fields in database
 3450                        annotation_fields_full = False
 3451                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3452                            annotation_fields = {
 3453                                key: key for key in db_hdr_vcf_header_infos
 3454                            }
 3455                            log.debug(
 3456                                "Annotation database header - All annotations added: "
 3457                                + str(annotation_fields)
 3458                            )
 3459                            annotation_fields_full = True
 3460
 3461                        # Init
 3462                        cyvcf2_header_rename_dict = {}
 3463                        cyvcf2_header_list = []
 3464                        cyvcf2_header_indexes = {}
 3465
 3466                        # process annotation fields
 3467                        for annotation_field in annotation_fields:
 3468
 3469                            # New annotation name
 3470                            annotation_field_new = annotation_fields[annotation_field]
 3471
 3472                            # Check annotation field and index in header
 3473                            if (
 3474                                annotation_field
 3475                                in db_hdr_vcf.get_header_columns_as_list()
 3476                            ):
 3477                                annotation_field_index = (
 3478                                    db_hdr_vcf.get_header_columns_as_list().index(
 3479                                        annotation_field
 3480                                    )
 3481                                    - 3
 3482                                )
 3483                                cyvcf2_header_indexes[annotation_field_new] = (
 3484                                    annotation_field_index
 3485                                )
 3486                            else:
 3487                                msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'"
 3488                                log.error(msg_err)
 3489                                raise ValueError(msg_err)
 3490
 3491                            # Append annotation field in cyvcf2 header list
 3492                            cyvcf2_header_rename_dict[annotation_field_new] = (
 3493                                db_hdr_vcf_header_infos[annotation_field].id
 3494                            )
 3495                            cyvcf2_header_list.append(
 3496                                {
 3497                                    "ID": annotation_field_new,
 3498                                    "Number": db_hdr_vcf_header_infos[
 3499                                        annotation_field
 3500                                    ].num,
 3501                                    "Type": db_hdr_vcf_header_infos[
 3502                                        annotation_field
 3503                                    ].type,
 3504                                    "Description": db_hdr_vcf_header_infos[
 3505                                        annotation_field
 3506                                    ].desc,
 3507                                }
 3508                            )
 3509
 3510                            # Add header on VCF
 3511                            vcf_reader.infos[annotation_field_new] = vcf.parser._Info(
 3512                                annotation_field_new,
 3513                                db_hdr_vcf_header_infos[annotation_field].num,
 3514                                db_hdr_vcf_header_infos[annotation_field].type,
 3515                                db_hdr_vcf_header_infos[annotation_field].desc,
 3516                                "HOWARD BigWig annotation",
 3517                                "unknown",
 3518                                self.code_type_map[
 3519                                    db_hdr_vcf_header_infos[annotation_field].type
 3520                                ],
 3521                            )
 3522
 3523                        # Load bigwig database
 3524                        bw_db = pyBigWig.open(db_file)
 3525                        if bw_db.isBigWig():
 3526                            log.debug(f"Database '{db_file}' is in 'BigWig' format")
 3527                        else:
 3528                            msg_err = f"Database '{db_file}' is NOT in 'BigWig' format"
 3529                            log.error(msg_err)
 3530                            raise ValueError(msg_err)
 3531
 3532                        annotation_bigwig_config_list.append(
 3533                            {
 3534                                "db_file": db_file,
 3535                                "bw_db": bw_db,
 3536                                "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict,
 3537                                "cyvcf2_header_list": cyvcf2_header_list,
 3538                                "cyvcf2_header_indexes": cyvcf2_header_indexes,
 3539                            }
 3540                        )
 3541
 3542                # Annotate
 3543                if annotation_bigwig_config_list:
 3544
 3545                    # Annotation config
 3546                    log.debug(
 3547                        f"annotation_bigwig_config={annotation_bigwig_config_list}"
 3548                    )
 3549
 3550                    # Export VCF file
 3551                    self.export_variant_vcf(
 3552                        vcf_file=tmp_vcf_name,
 3553                        remove_info=True,
 3554                        add_samples=False,
 3555                        index=True,
 3556                    )
 3557
 3558                    # Load input tmp file
 3559                    input_vcf = cyvcf2.VCF(tmp_vcf_name)
 3560
 3561                    # Add header in input file
 3562                    for annotation_bigwig_config in annotation_bigwig_config_list:
 3563                        for cyvcf2_header_field in annotation_bigwig_config.get(
 3564                            "cyvcf2_header_list", []
 3565                        ):
 3566                            log.info(
 3567                                f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'"
 3568                            )
 3569                            input_vcf.add_info_to_header(cyvcf2_header_field)
 3570
 3571                    # Create output VCF file
 3572                    output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz")
 3573                    output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf)
 3574
 3575                    # Fetch variants
 3576                    log.info(f"Annotations 'bigwig' start...")
 3577                    for variant in input_vcf:
 3578
 3579                        for annotation_bigwig_config in annotation_bigwig_config_list:
 3580
 3581                            # DB and indexes
 3582                            bw_db = annotation_bigwig_config.get("bw_db", None)
 3583                            cyvcf2_header_indexes = annotation_bigwig_config.get(
 3584                                "cyvcf2_header_indexes", None
 3585                            )
 3586
 3587                            # Retrieve value from chrom pos
 3588                            res = bw_db.values(
 3589                                variant.CHROM, variant.POS - 1, variant.POS
 3590                            )
 3591
 3592                            # For each annotation fields (and indexes)
 3593                            for cyvcf2_header_index in cyvcf2_header_indexes:
 3594
 3595                                # If value is NOT nNone
 3596                                if not np.isnan(
 3597                                    res[cyvcf2_header_indexes[cyvcf2_header_index]]
 3598                                ):
 3599                                    variant.INFO[cyvcf2_header_index] = res[
 3600                                        cyvcf2_header_indexes[cyvcf2_header_index]
 3601                                    ]
 3602
 3603                        # Add record in output file
 3604                        output_vcf.write_record(variant)
 3605
 3606                    # Log
 3607                    log.debug(f"Annotation done.")
 3608
 3609                    # Close and write file
 3610                    log.info(f"Annotations 'bigwig' write...")
 3611                    output_vcf.close()
 3612                    log.debug(f"Write done.")
 3613
 3614                    # Update variants
 3615                    log.info(f"Annotations 'bigwig' update...")
 3616                    self.update_from_vcf(output_vcf_file)
 3617                    log.debug(f"Update done.")
 3618
 3619        return True
 3620
 3621    def annotation_snpsift(self, threads: int = None) -> None:
 3622        """
 3623        This function annotate with bcftools
 3624
 3625        :param threads: Number of threads to use
 3626        :return: the value of the variable "return_value".
 3627        """
 3628
 3629        # DEBUG
 3630        log.debug("Start annotation with bcftools databases")
 3631
 3632        # Threads
 3633        if not threads:
 3634            threads = self.get_threads()
 3635        log.debug("Threads: " + str(threads))
 3636
 3637        # Config
 3638        config = self.get_config()
 3639        log.debug("Config: " + str(config))
 3640
 3641        # Config - snpSift
 3642        snpsift_bin_command = get_bin_command(
 3643            bin="SnpSift.jar",
 3644            tool="snpsift",
 3645            bin_type="jar",
 3646            config=config,
 3647            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 3648        )
 3649        if not snpsift_bin_command:
 3650            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
 3651            log.error(msg_err)
 3652            raise ValueError(msg_err)
 3653
 3654        # Config - bcftools
 3655        bcftools_bin_command = get_bin_command(
 3656            bin="bcftools",
 3657            tool="bcftools",
 3658            bin_type="bin",
 3659            config=config,
 3660            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3661        )
 3662        if not bcftools_bin_command:
 3663            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3664            log.error(msg_err)
 3665            raise ValueError(msg_err)
 3666
 3667        # Config - BCFTools databases folders
 3668        databases_folders = set(
 3669            self.get_config()
 3670            .get("folders", {})
 3671            .get("databases", {})
 3672            .get("annotations", ["."])
 3673            + self.get_config()
 3674            .get("folders", {})
 3675            .get("databases", {})
 3676            .get("bcftools", ["."])
 3677        )
 3678        log.debug("Databases annotations: " + str(databases_folders))
 3679
 3680        # Param
 3681        annotations = (
 3682            self.get_param()
 3683            .get("annotation", {})
 3684            .get("snpsift", {})
 3685            .get("annotations", None)
 3686        )
 3687        log.debug("Annotations: " + str(annotations))
 3688
 3689        # Assembly
 3690        assembly = self.get_param().get(
 3691            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3692        )
 3693
 3694        # Data
 3695        table_variants = self.get_table_variants()
 3696
 3697        # Check if not empty
 3698        log.debug("Check if not empty")
 3699        sql_query_chromosomes = (
 3700            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3701        )
 3702        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3703        if not sql_query_chromosomes_df["count"][0]:
 3704            log.info(f"VCF empty")
 3705            return
 3706
 3707        # VCF header
 3708        vcf_reader = self.get_header()
 3709        log.debug("Initial header: " + str(vcf_reader.infos))
 3710
 3711        # Existing annotations
 3712        for vcf_annotation in self.get_header().infos:
 3713
 3714            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3715            log.debug(
 3716                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3717            )
 3718
 3719        if annotations:
 3720
 3721            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3722
 3723                # Export VCF file
 3724                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3725
 3726                # Init
 3727                commands = {}
 3728
 3729                for annotation in annotations:
 3730                    annotation_fields = annotations[annotation]
 3731
 3732                    # Annotation Name
 3733                    annotation_name = os.path.basename(annotation)
 3734
 3735                    if not annotation_fields:
 3736                        annotation_fields = {"INFO": None}
 3737
 3738                    log.debug(f"Annotation '{annotation_name}'")
 3739                    log.debug(
 3740                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3741                    )
 3742
 3743                    # Create Database
 3744                    database = Database(
 3745                        database=annotation,
 3746                        databases_folders=databases_folders,
 3747                        assembly=assembly,
 3748                    )
 3749
 3750                    # Find files
 3751                    db_file = database.get_database()
 3752                    db_file = full_path(db_file)
 3753                    db_hdr_file = database.get_header_file()
 3754                    db_hdr_file = full_path(db_hdr_file)
 3755                    db_file_type = database.get_format()
 3756                    db_tbi_file = f"{db_file}.tbi"
 3757                    db_file_compressed = database.is_compressed()
 3758
 3759                    # Check if compressed
 3760                    if not db_file_compressed:
 3761                        log.error(
 3762                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3763                        )
 3764                        raise ValueError(
 3765                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3766                        )
 3767
 3768                    # Check if indexed
 3769                    if not os.path.exists(db_tbi_file):
 3770                        log.error(
 3771                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3772                        )
 3773                        raise ValueError(
 3774                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3775                        )
 3776
 3777                    # Check index - try to create if not exists
 3778                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 3779                        log.error("Annotation failed: database not valid")
 3780                        log.error(f"Annotation annotation file: {db_file}")
 3781                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3782                        log.error(f"Annotation annotation index: {db_tbi_file}")
 3783                        raise ValueError(
 3784                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 3785                        )
 3786                    else:
 3787
 3788                        log.debug(
 3789                            f"Annotation '{annotation}' - file: "
 3790                            + str(db_file)
 3791                            + " and "
 3792                            + str(db_hdr_file)
 3793                        )
 3794
 3795                        # Load header as VCF object
 3796                        db_hdr_vcf = Variants(input=db_hdr_file)
 3797                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3798                        log.debug(
 3799                            "Annotation database header: "
 3800                            + str(db_hdr_vcf_header_infos)
 3801                        )
 3802
 3803                        # For all fields in database
 3804                        annotation_fields_full = False
 3805                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3806                            annotation_fields = {
 3807                                key: key for key in db_hdr_vcf_header_infos
 3808                            }
 3809                            log.debug(
 3810                                "Annotation database header - All annotations added: "
 3811                                + str(annotation_fields)
 3812                            )
 3813                            annotation_fields_full = True
 3814
 3815                        # # Create file for field rename
 3816                        # log.debug("Create file for field rename")
 3817                        # tmp_rename = NamedTemporaryFile(
 3818                        #     prefix=self.get_prefix(),
 3819                        #     dir=self.get_tmp_dir(),
 3820                        #     suffix=".rename",
 3821                        #     delete=False,
 3822                        # )
 3823                        # tmp_rename_name = tmp_rename.name
 3824                        # tmp_files.append(tmp_rename_name)
 3825
 3826                        # Number of fields
 3827                        nb_annotation_field = 0
 3828                        annotation_list = []
 3829                        annotation_infos_rename_list = []
 3830
 3831                        for annotation_field in annotation_fields:
 3832
 3833                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 3834                            annotation_fields_new_name = annotation_fields.get(
 3835                                annotation_field, annotation_field
 3836                            )
 3837                            if not annotation_fields_new_name:
 3838                                annotation_fields_new_name = annotation_field
 3839
 3840                            # Check if field is in DB and if field is not elready in input data
 3841                            if (
 3842                                annotation_field in db_hdr_vcf.get_header().infos
 3843                                and annotation_fields_new_name
 3844                                not in self.get_header().infos
 3845                            ):
 3846
 3847                                log.info(
 3848                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 3849                                )
 3850
 3851                                # BCFTools annotate param to rename fields
 3852                                if annotation_field != annotation_fields_new_name:
 3853                                    annotation_infos_rename_list.append(
 3854                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 3855                                    )
 3856
 3857                                # Add INFO field to header
 3858                                db_hdr_vcf_header_infos_number = (
 3859                                    db_hdr_vcf_header_infos[annotation_field].num or "."
 3860                                )
 3861                                db_hdr_vcf_header_infos_type = (
 3862                                    db_hdr_vcf_header_infos[annotation_field].type
 3863                                    or "String"
 3864                                )
 3865                                db_hdr_vcf_header_infos_description = (
 3866                                    db_hdr_vcf_header_infos[annotation_field].desc
 3867                                    or f"{annotation_field} description"
 3868                                )
 3869                                db_hdr_vcf_header_infos_source = (
 3870                                    db_hdr_vcf_header_infos[annotation_field].source
 3871                                    or "unknown"
 3872                                )
 3873                                db_hdr_vcf_header_infos_version = (
 3874                                    db_hdr_vcf_header_infos[annotation_field].version
 3875                                    or "unknown"
 3876                                )
 3877
 3878                                vcf_reader.infos[annotation_fields_new_name] = (
 3879                                    vcf.parser._Info(
 3880                                        annotation_fields_new_name,
 3881                                        db_hdr_vcf_header_infos_number,
 3882                                        db_hdr_vcf_header_infos_type,
 3883                                        db_hdr_vcf_header_infos_description,
 3884                                        db_hdr_vcf_header_infos_source,
 3885                                        db_hdr_vcf_header_infos_version,
 3886                                        self.code_type_map[
 3887                                            db_hdr_vcf_header_infos_type
 3888                                        ],
 3889                                    )
 3890                                )
 3891
 3892                                annotation_list.append(annotation_field)
 3893
 3894                                nb_annotation_field += 1
 3895
 3896                            else:
 3897
 3898                                if (
 3899                                    annotation_field
 3900                                    not in db_hdr_vcf.get_header().infos
 3901                                ):
 3902                                    log.warning(
 3903                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
 3904                                    )
 3905                                if (
 3906                                    annotation_fields_new_name
 3907                                    in self.get_header().infos
 3908                                ):
 3909                                    log.warning(
 3910                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
 3911                                    )
 3912
 3913                        log.info(
 3914                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 3915                        )
 3916
 3917                        annotation_infos = ",".join(annotation_list)
 3918
 3919                        if annotation_infos != "":
 3920
 3921                            # Annotated VCF (and error file)
 3922                            tmp_annotation_vcf_name = os.path.join(
 3923                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
 3924                            )
 3925                            tmp_annotation_vcf_name_err = (
 3926                                tmp_annotation_vcf_name + ".err"
 3927                            )
 3928
 3929                            # Add fields to annotate
 3930                            if not annotation_fields_full:
 3931                                annotation_infos_option = f"-info {annotation_infos}"
 3932                            else:
 3933                                annotation_infos_option = ""
 3934
 3935                            # Info fields rename
 3936                            if annotation_infos_rename_list:
 3937                                annotation_infos_rename = " -c " + ",".join(
 3938                                    annotation_infos_rename_list
 3939                                )
 3940                            else:
 3941                                annotation_infos_rename = ""
 3942
 3943                            # Annotate command
 3944                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 3945
 3946                            # Add command
 3947                            commands[command_annotate] = tmp_annotation_vcf_name
 3948
 3949                if commands:
 3950
 3951                    # Export VCF file
 3952                    self.export_variant_vcf(
 3953                        vcf_file=tmp_vcf_name,
 3954                        remove_info=True,
 3955                        add_samples=False,
 3956                        index=True,
 3957                    )
 3958                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
 3959
 3960                    # Num command
 3961                    nb_command = 0
 3962
 3963                    # Annotate
 3964                    for command_annotate in commands:
 3965                        nb_command += 1
 3966                        log.info(
 3967                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
 3968                        )
 3969                        log.debug(f"command_annotate={command_annotate}")
 3970                        run_parallel_commands([command_annotate], threads)
 3971
 3972                        # Debug
 3973                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
 3974
 3975                        # Update variants
 3976                        log.info(
 3977                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
 3978                        )
 3979                        self.update_from_vcf(commands[command_annotate])
 3980
 3981    def annotation_bcftools(self, threads: int = None) -> None:
 3982        """
 3983        This function annotate with bcftools
 3984
 3985        :param threads: Number of threads to use
 3986        :return: the value of the variable "return_value".
 3987        """
 3988
 3989        # DEBUG
 3990        log.debug("Start annotation with bcftools databases")
 3991
 3992        # Threads
 3993        if not threads:
 3994            threads = self.get_threads()
 3995        log.debug("Threads: " + str(threads))
 3996
 3997        # Config
 3998        config = self.get_config()
 3999        log.debug("Config: " + str(config))
 4000
 4001        # DEBUG
 4002        delete_tmp = True
 4003        if self.get_config().get("verbosity", "warning") in ["debug"]:
 4004            delete_tmp = False
 4005            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 4006
 4007        # Config - BCFTools bin command
 4008        bcftools_bin_command = get_bin_command(
 4009            bin="bcftools",
 4010            tool="bcftools",
 4011            bin_type="bin",
 4012            config=config,
 4013            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 4014        )
 4015        if not bcftools_bin_command:
 4016            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 4017            log.error(msg_err)
 4018            raise ValueError(msg_err)
 4019
 4020        # Config - BCFTools databases folders
 4021        databases_folders = set(
 4022            self.get_config()
 4023            .get("folders", {})
 4024            .get("databases", {})
 4025            .get("annotations", ["."])
 4026            + self.get_config()
 4027            .get("folders", {})
 4028            .get("databases", {})
 4029            .get("bcftools", ["."])
 4030        )
 4031        log.debug("Databases annotations: " + str(databases_folders))
 4032
 4033        # Param
 4034        annotations = (
 4035            self.get_param()
 4036            .get("annotation", {})
 4037            .get("bcftools", {})
 4038            .get("annotations", None)
 4039        )
 4040        log.debug("Annotations: " + str(annotations))
 4041
 4042        # Assembly
 4043        assembly = self.get_param().get(
 4044            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 4045        )
 4046
 4047        # Data
 4048        table_variants = self.get_table_variants()
 4049
 4050        # Check if not empty
 4051        log.debug("Check if not empty")
 4052        sql_query_chromosomes = (
 4053            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4054        )
 4055        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 4056        if not sql_query_chromosomes_df["count"][0]:
 4057            log.info(f"VCF empty")
 4058            return
 4059
 4060        # Export in VCF
 4061        log.debug("Create initial file to annotate")
 4062        tmp_vcf = NamedTemporaryFile(
 4063            prefix=self.get_prefix(),
 4064            dir=self.get_tmp_dir(),
 4065            suffix=".vcf.gz",
 4066            delete=False,
 4067        )
 4068        tmp_vcf_name = tmp_vcf.name
 4069
 4070        # VCF header
 4071        vcf_reader = self.get_header()
 4072        log.debug("Initial header: " + str(vcf_reader.infos))
 4073
 4074        # Existing annotations
 4075        for vcf_annotation in self.get_header().infos:
 4076
 4077            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 4078            log.debug(
 4079                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 4080            )
 4081
 4082        if annotations:
 4083
 4084            tmp_ann_vcf_list = []
 4085            commands = []
 4086            tmp_files = []
 4087            err_files = []
 4088
 4089            for annotation in annotations:
 4090                annotation_fields = annotations[annotation]
 4091
 4092                # Annotation Name
 4093                annotation_name = os.path.basename(annotation)
 4094
 4095                if not annotation_fields:
 4096                    annotation_fields = {"INFO": None}
 4097
 4098                log.debug(f"Annotation '{annotation_name}'")
 4099                log.debug(
 4100                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 4101                )
 4102
 4103                # Create Database
 4104                database = Database(
 4105                    database=annotation,
 4106                    databases_folders=databases_folders,
 4107                    assembly=assembly,
 4108                )
 4109
 4110                # Find files
 4111                db_file = database.get_database()
 4112                db_file = full_path(db_file)
 4113                db_hdr_file = database.get_header_file()
 4114                db_hdr_file = full_path(db_hdr_file)
 4115                db_file_type = database.get_format()
 4116                db_tbi_file = f"{db_file}.tbi"
 4117                db_file_compressed = database.is_compressed()
 4118
 4119                # Check if compressed
 4120                if not db_file_compressed:
 4121                    log.error(
 4122                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 4123                    )
 4124                    raise ValueError(
 4125                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 4126                    )
 4127
 4128                # Check if indexed
 4129                if not os.path.exists(db_tbi_file):
 4130                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
 4131                    raise ValueError(
 4132                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
 4133                    )
 4134
 4135                # Check index - try to create if not exists
 4136                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 4137                    log.error("Annotation failed: database not valid")
 4138                    log.error(f"Annotation annotation file: {db_file}")
 4139                    log.error(f"Annotation annotation header: {db_hdr_file}")
 4140                    log.error(f"Annotation annotation index: {db_tbi_file}")
 4141                    raise ValueError(
 4142                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 4143                    )
 4144                else:
 4145
 4146                    log.debug(
 4147                        f"Annotation '{annotation}' - file: "
 4148                        + str(db_file)
 4149                        + " and "
 4150                        + str(db_hdr_file)
 4151                    )
 4152
 4153                    # Load header as VCF object
 4154                    db_hdr_vcf = Variants(input=db_hdr_file)
 4155                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 4156                    log.debug(
 4157                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
 4158                    )
 4159
 4160                    # For all fields in database
 4161                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 4162                        annotation_fields = {
 4163                            key: key for key in db_hdr_vcf_header_infos
 4164                        }
 4165                        log.debug(
 4166                            "Annotation database header - All annotations added: "
 4167                            + str(annotation_fields)
 4168                        )
 4169
 4170                    # Number of fields
 4171                    nb_annotation_field = 0
 4172                    annotation_list = []
 4173
 4174                    for annotation_field in annotation_fields:
 4175
 4176                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 4177                        annotation_fields_new_name = annotation_fields.get(
 4178                            annotation_field, annotation_field
 4179                        )
 4180                        if not annotation_fields_new_name:
 4181                            annotation_fields_new_name = annotation_field
 4182
 4183                        # Check if field is in DB and if field is not elready in input data
 4184                        if (
 4185                            annotation_field in db_hdr_vcf.get_header().infos
 4186                            and annotation_fields_new_name
 4187                            not in self.get_header().infos
 4188                        ):
 4189
 4190                            log.info(
 4191                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 4192                            )
 4193
 4194                            # Add INFO field to header
 4195                            db_hdr_vcf_header_infos_number = (
 4196                                db_hdr_vcf_header_infos[annotation_field].num or "."
 4197                            )
 4198                            db_hdr_vcf_header_infos_type = (
 4199                                db_hdr_vcf_header_infos[annotation_field].type
 4200                                or "String"
 4201                            )
 4202                            db_hdr_vcf_header_infos_description = (
 4203                                db_hdr_vcf_header_infos[annotation_field].desc
 4204                                or f"{annotation_field} description"
 4205                            )
 4206                            db_hdr_vcf_header_infos_source = (
 4207                                db_hdr_vcf_header_infos[annotation_field].source
 4208                                or "unknown"
 4209                            )
 4210                            db_hdr_vcf_header_infos_version = (
 4211                                db_hdr_vcf_header_infos[annotation_field].version
 4212                                or "unknown"
 4213                            )
 4214
 4215                            vcf_reader.infos[annotation_fields_new_name] = (
 4216                                vcf.parser._Info(
 4217                                    annotation_fields_new_name,
 4218                                    db_hdr_vcf_header_infos_number,
 4219                                    db_hdr_vcf_header_infos_type,
 4220                                    db_hdr_vcf_header_infos_description,
 4221                                    db_hdr_vcf_header_infos_source,
 4222                                    db_hdr_vcf_header_infos_version,
 4223                                    self.code_type_map[db_hdr_vcf_header_infos_type],
 4224                                )
 4225                            )
 4226
 4227                            # annotation_list.append(annotation_field)
 4228                            if annotation_field != annotation_fields_new_name:
 4229                                annotation_list.append(
 4230                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 4231                                )
 4232                            else:
 4233                                annotation_list.append(annotation_field)
 4234
 4235                            nb_annotation_field += 1
 4236
 4237                        else:
 4238
 4239                            if annotation_field not in db_hdr_vcf.get_header().infos:
 4240                                log.warning(
 4241                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
 4242                                )
 4243                            if annotation_fields_new_name in self.get_header().infos:
 4244                                log.warning(
 4245                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 4246                                )
 4247
 4248                    log.info(
 4249                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 4250                    )
 4251
 4252                    annotation_infos = ",".join(annotation_list)
 4253
 4254                    if annotation_infos != "":
 4255
 4256                        # Protect header for bcftools (remove "#CHROM" and variants line)
 4257                        log.debug("Protect Header file - remove #CHROM line if exists")
 4258                        tmp_header_vcf = NamedTemporaryFile(
 4259                            prefix=self.get_prefix(),
 4260                            dir=self.get_tmp_dir(),
 4261                            suffix=".hdr",
 4262                            delete=False,
 4263                        )
 4264                        tmp_header_vcf_name = tmp_header_vcf.name
 4265                        tmp_files.append(tmp_header_vcf_name)
 4266                        # Command
 4267                        if db_hdr_file.endswith(".gz"):
 4268                            command_extract_header = f"zcat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 4269                        else:
 4270                            command_extract_header = f"cat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 4271                        # Run
 4272                        run_parallel_commands([command_extract_header], 1)
 4273
 4274                        # Find chomosomes
 4275                        log.debug("Find chromosomes ")
 4276                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
 4277                        sql_query_chromosomes_df = self.get_query_to_df(
 4278                            sql_query_chromosomes
 4279                        )
 4280                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
 4281
 4282                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
 4283
 4284                        # BED columns in the annotation file
 4285                        if db_file_type in ["bed"]:
 4286                            annotation_infos = "CHROM,POS,POS," + annotation_infos
 4287
 4288                        for chrom in chomosomes_list:
 4289
 4290                            # Create BED on initial VCF
 4291                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
 4292                            tmp_bed = NamedTemporaryFile(
 4293                                prefix=self.get_prefix(),
 4294                                dir=self.get_tmp_dir(),
 4295                                suffix=".bed",
 4296                                delete=False,
 4297                            )
 4298                            tmp_bed_name = tmp_bed.name
 4299                            tmp_files.append(tmp_bed_name)
 4300
 4301                            # Detecte regions
 4302                            log.debug(
 4303                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
 4304                            )
 4305                            window = 1000000
 4306                            sql_query_intervals_for_bed = f"""
 4307                                SELECT  \"#CHROM\",
 4308                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
 4309                                        \"POS\"+{window}
 4310                                FROM {table_variants} as table_variants
 4311                                WHERE table_variants.\"#CHROM\" = '{chrom}'
 4312                            """
 4313                            regions = self.conn.execute(
 4314                                sql_query_intervals_for_bed
 4315                            ).fetchall()
 4316                            merged_regions = merge_regions(regions)
 4317                            log.debug(
 4318                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
 4319                            )
 4320
 4321                            header = ["#CHROM", "START", "END"]
 4322                            with open(tmp_bed_name, "w") as f:
 4323                                # Write the header with tab delimiter
 4324                                f.write("\t".join(header) + "\n")
 4325                                for d in merged_regions:
 4326                                    # Write each data row with tab delimiter
 4327                                    f.write("\t".join(map(str, d)) + "\n")
 4328
 4329                            # Tmp files
 4330                            tmp_annotation_vcf = NamedTemporaryFile(
 4331                                prefix=self.get_prefix(),
 4332                                dir=self.get_tmp_dir(),
 4333                                suffix=".vcf.gz",
 4334                                delete=False,
 4335                            )
 4336                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
 4337                            tmp_files.append(tmp_annotation_vcf_name)
 4338                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
 4339                            tmp_annotation_vcf_name_err = (
 4340                                tmp_annotation_vcf_name + ".err"
 4341                            )
 4342                            err_files.append(tmp_annotation_vcf_name_err)
 4343
 4344                            # Annotate Command
 4345                            log.debug(
 4346                                f"Annotation '{annotation}' - add bcftools command"
 4347                            )
 4348
 4349                            # Command
 4350                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 4351
 4352                            # Add command
 4353                            commands.append(command_annotate)
 4354
 4355            # if some commands
 4356            if commands:
 4357
 4358                # Export VCF file
 4359                self.export_variant_vcf(
 4360                    vcf_file=tmp_vcf_name,
 4361                    remove_info=True,
 4362                    add_samples=False,
 4363                    index=True,
 4364                )
 4365
 4366                # Threads
 4367                # calculate threads for annotated commands
 4368                if commands:
 4369                    threads_bcftools_annotate = round(threads / len(commands))
 4370                else:
 4371                    threads_bcftools_annotate = 1
 4372
 4373                if not threads_bcftools_annotate:
 4374                    threads_bcftools_annotate = 1
 4375
 4376                # Add threads option to bcftools commands
 4377                if threads_bcftools_annotate > 1:
 4378                    commands_threaded = []
 4379                    for command in commands:
 4380                        commands_threaded.append(
 4381                            command.replace(
 4382                                f"{bcftools_bin_command} annotate ",
 4383                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
 4384                            )
 4385                        )
 4386                    commands = commands_threaded
 4387
 4388                # Command annotation multithreading
 4389                log.debug(f"Annotation - Annotation commands: " + str(commands))
 4390                log.info(
 4391                    f"Annotation - Annotation multithreaded in "
 4392                    + str(len(commands))
 4393                    + " commands"
 4394                )
 4395
 4396                run_parallel_commands(commands, threads)
 4397
 4398                # Merge
 4399                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
 4400
 4401                if tmp_ann_vcf_list_cmd:
 4402
 4403                    # Tmp file
 4404                    tmp_annotate_vcf = NamedTemporaryFile(
 4405                        prefix=self.get_prefix(),
 4406                        dir=self.get_tmp_dir(),
 4407                        suffix=".vcf.gz",
 4408                        delete=True,
 4409                    )
 4410                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
 4411                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 4412                    err_files.append(tmp_annotate_vcf_name_err)
 4413
 4414                    # Tmp file remove command
 4415                    tmp_files_remove_command = ""
 4416                    if tmp_files:
 4417                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
 4418
 4419                    # Command merge
 4420                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
 4421                    log.info(
 4422                        f"Annotation - Annotation merging "
 4423                        + str(len(commands))
 4424                        + " annotated files"
 4425                    )
 4426                    log.debug(f"Annotation - merge command: {merge_command}")
 4427                    run_parallel_commands([merge_command], 1)
 4428
 4429                    # Error messages
 4430                    log.info(f"Error/Warning messages:")
 4431                    error_message_command_all = []
 4432                    error_message_command_warning = []
 4433                    error_message_command_err = []
 4434                    for err_file in err_files:
 4435                        with open(err_file, "r") as f:
 4436                            for line in f:
 4437                                message = line.strip()
 4438                                error_message_command_all.append(message)
 4439                                if line.startswith("[W::"):
 4440                                    error_message_command_warning.append(message)
 4441                                if line.startswith("[E::"):
 4442                                    error_message_command_err.append(
 4443                                        f"{err_file}: " + message
 4444                                    )
 4445                    # log info
 4446                    for message in list(
 4447                        set(error_message_command_err + error_message_command_warning)
 4448                    ):
 4449                        log.info(f"   {message}")
 4450                    # debug info
 4451                    for message in list(set(error_message_command_all)):
 4452                        log.debug(f"   {message}")
 4453                    # failed
 4454                    if len(error_message_command_err):
 4455                        log.error("Annotation failed: Error in commands")
 4456                        raise ValueError("Annotation failed: Error in commands")
 4457
 4458                    # Update variants
 4459                    log.info(f"Annotation - Updating...")
 4460                    self.update_from_vcf(tmp_annotate_vcf_name)
 4461
 4462    def annotation_exomiser(self, threads: int = None) -> None:
 4463        """
 4464        This function annotate with Exomiser
 4465
 4466        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
 4467        - "analysis" (dict/file):
 4468            Full analysis dictionary parameters (see Exomiser docs).
 4469            Either a dict, or a file in JSON or YAML format.
 4470            These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO)
 4471            Default : None
 4472        - "preset" (string):
 4473            Analysis preset (available in config folder).
 4474            Used if no full "analysis" is provided.
 4475            Default: "exome"
 4476        - "phenopacket" (dict/file):
 4477            Samples and phenotypic features parameters (see Exomiser docs).
 4478            Either a dict, or a file in JSON or YAML format.
 4479            Default: None
 4480        - "subject" (dict):
 4481            Sample parameters (see Exomiser docs).
 4482            Example:
 4483                "subject":
 4484                    {
 4485                        "id": "ISDBM322017",
 4486                        "sex": "FEMALE"
 4487                    }
 4488            Default: None
 4489        - "sample" (string):
 4490            Sample name to construct "subject" section:
 4491                "subject":
 4492                    {
 4493                        "id": "<sample>",
 4494                        "sex": "UNKNOWN_SEX"
 4495                    }
 4496            Default: None
 4497        - "phenotypicFeatures" (dict)
 4498            Phenotypic features to construct "subject" section.
 4499            Example:
 4500                "phenotypicFeatures":
 4501                    [
 4502                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
 4503                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
 4504                    ]
 4505        - "hpo" (list)
 4506            List of HPO ids as phenotypic features.
 4507            Example:
 4508                "hpo": ['0001156', '0001363', '0011304', '0010055']
 4509            Default: []
 4510        - "outputOptions" (dict):
 4511            Output options (see Exomiser docs).
 4512            Default:
 4513                "output_options" =
 4514                    {
 4515                        "outputContributingVariantsOnly": False,
 4516                        "numGenes": 0,
 4517                        "outputFormats": ["TSV_VARIANT", "VCF"]
 4518                    }
 4519        - "transcript_source" (string):
 4520            Transcript source (either "refseq", "ucsc", "ensembl")
 4521            Default: "refseq"
 4522        - "exomiser_to_info" (boolean):
 4523            Add exomiser TSV file columns as INFO fields in VCF.
 4524            Default: False
 4525        - "release" (string):
 4526            Exomiser database release.
 4527            If not exists, database release will be downloaded (takes a while).
 4528            Default: None (provided by application.properties configuration file)
 4529        - "exomiser_application_properties" (file):
 4530            Exomiser configuration file (see Exomiser docs).
 4531            Useful to automatically download databases (especially for specific genome databases).
 4532
 4533        Notes:
 4534        - If no sample in parameters, first sample in VCF will be chosen
 4535        - If no HPO found, "hiPhivePrioritiser" analysis step will be switched off
 4536
 4537        :param threads: The number of threads to use
 4538        :return: None.
 4539        """
 4540
 4541        # DEBUG
 4542        log.debug("Start annotation with Exomiser databases")
 4543
 4544        # Threads
 4545        if not threads:
 4546            threads = self.get_threads()
 4547        log.debug("Threads: " + str(threads))
 4548
 4549        # Config
 4550        config = self.get_config()
 4551        log.debug("Config: " + str(config))
 4552
 4553        # Config - Folders - Databases
 4554        databases_folders = (
 4555            config.get("folders", {})
 4556            .get("databases", {})
 4557            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
 4558        )
 4559        databases_folders = full_path(databases_folders)
 4560        if not os.path.exists(databases_folders):
 4561            log.error(f"Databases annotations: {databases_folders} NOT found")
 4562        log.debug("Databases annotations: " + str(databases_folders))
 4563
 4564        # Config - Exomiser
 4565        exomiser_bin_command = get_bin_command(
 4566            bin="exomiser-cli*.jar",
 4567            tool="exomiser",
 4568            bin_type="jar",
 4569            config=config,
 4570            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
 4571        )
 4572        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
 4573        if not exomiser_bin_command:
 4574            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
 4575            log.error(msg_err)
 4576            raise ValueError(msg_err)
 4577
 4578        # Param
 4579        param = self.get_param()
 4580        log.debug("Param: " + str(param))
 4581
 4582        # Param - Exomiser
 4583        param_exomiser = param.get("annotation", {}).get("exomiser", {})
 4584        log.debug(f"Param Exomiser: {param_exomiser}")
 4585
 4586        # Param - Assembly
 4587        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4588        log.debug("Assembly: " + str(assembly))
 4589
 4590        # Data
 4591        table_variants = self.get_table_variants()
 4592
 4593        # Check if not empty
 4594        log.debug("Check if not empty")
 4595        sql_query_chromosomes = (
 4596            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4597        )
 4598        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4599            log.info(f"VCF empty")
 4600            return False
 4601
 4602        # VCF header
 4603        vcf_reader = self.get_header()
 4604        log.debug("Initial header: " + str(vcf_reader.infos))
 4605
 4606        # Samples
 4607        samples = self.get_header_sample_list()
 4608        if not samples:
 4609            log.error("No Samples in VCF")
 4610            return False
 4611        log.debug(f"Samples: {samples}")
 4612
 4613        # Memory limit
 4614        memory_limit = self.get_memory("8G")
 4615        log.debug(f"memory_limit: {memory_limit}")
 4616
 4617        # Exomiser java options
 4618        exomiser_java_options = (
 4619            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 4620        )
 4621        log.debug(f"Exomiser java options: {exomiser_java_options}")
 4622
 4623        # Download Exomiser (if not exists)
 4624        exomiser_release = param_exomiser.get("release", None)
 4625        exomiser_application_properties = param_exomiser.get(
 4626            "exomiser_application_properties", None
 4627        )
 4628        databases_download_exomiser(
 4629            assemblies=[assembly],
 4630            exomiser_folder=databases_folders,
 4631            exomiser_release=exomiser_release,
 4632            exomiser_phenotype_release=exomiser_release,
 4633            exomiser_application_properties=exomiser_application_properties,
 4634        )
 4635
 4636        # Force annotation
 4637        force_update_annotation = True
 4638
 4639        if "Exomiser" not in self.get_header().infos or force_update_annotation:
 4640            log.debug("Start annotation Exomiser")
 4641
 4642            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 4643
 4644                # tmp_dir = "/tmp/exomiser"
 4645
 4646                ### ANALYSIS ###
 4647                ################
 4648
 4649                # Create analysis.json through analysis dict
 4650                # either analysis in param or by default
 4651                # depending on preset exome/genome)
 4652
 4653                # Init analysis dict
 4654                param_exomiser_analysis_dict = {}
 4655
 4656                # analysis from param
 4657                param_exomiser_analysis = param_exomiser.get("analysis", {})
 4658                param_exomiser_analysis = full_path(param_exomiser_analysis)
 4659
 4660                # If analysis in param -> load analysis json
 4661                if param_exomiser_analysis:
 4662
 4663                    # If param analysis is a file and exists
 4664                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
 4665                        param_exomiser_analysis
 4666                    ):
 4667                        # Load analysis file into analysis dict (either yaml or json)
 4668                        with open(param_exomiser_analysis) as json_file:
 4669                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
 4670
 4671                    # If param analysis is a dict
 4672                    elif isinstance(param_exomiser_analysis, dict):
 4673                        # Load analysis dict into analysis dict (either yaml or json)
 4674                        param_exomiser_analysis_dict = param_exomiser_analysis
 4675
 4676                    # Error analysis type
 4677                    else:
 4678                        log.error(f"Analysis type unknown. Check param file.")
 4679                        raise ValueError(f"Analysis type unknown. Check param file.")
 4680
 4681                # Case no input analysis config file/dict
 4682                # Use preset (exome/genome) to open default config file
 4683                if not param_exomiser_analysis_dict:
 4684
 4685                    # default preset
 4686                    default_preset = "exome"
 4687
 4688                    # Get param preset or default preset
 4689                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
 4690
 4691                    # Try to find if preset is a file
 4692                    if os.path.exists(param_exomiser_preset):
 4693                        # Preset file is provided in full path
 4694                        param_exomiser_analysis_default_config_file = (
 4695                            param_exomiser_preset
 4696                        )
 4697                    # elif os.path.exists(full_path(param_exomiser_preset)):
 4698                    #     # Preset file is provided in full path
 4699                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
 4700                    elif os.path.exists(
 4701                        os.path.join(folder_config, param_exomiser_preset)
 4702                    ):
 4703                        # Preset file is provided as a basename in config folder (can be a path with subfolders)
 4704                        param_exomiser_analysis_default_config_file = os.path.join(
 4705                            folder_config, param_exomiser_preset
 4706                        )
 4707                    else:
 4708                        # Construct preset file
 4709                        param_exomiser_analysis_default_config_file = os.path.join(
 4710                            folder_config,
 4711                            f"preset-{param_exomiser_preset}-analysis.json",
 4712                        )
 4713
 4714                    # If preset file exists
 4715                    param_exomiser_analysis_default_config_file = full_path(
 4716                        param_exomiser_analysis_default_config_file
 4717                    )
 4718                    if os.path.exists(param_exomiser_analysis_default_config_file):
 4719                        # Load preset file into analysis dict (either yaml or json)
 4720                        with open(
 4721                            param_exomiser_analysis_default_config_file
 4722                        ) as json_file:
 4723                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
 4724                                json_file
 4725                            )
 4726
 4727                    # Error preset file
 4728                    else:
 4729                        log.error(
 4730                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4731                        )
 4732                        raise ValueError(
 4733                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4734                        )
 4735
 4736                # If no analysis dict created
 4737                if not param_exomiser_analysis_dict:
 4738                    log.error(f"No analysis config")
 4739                    raise ValueError(f"No analysis config")
 4740
 4741                # Log
 4742                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4743
 4744                ### PHENOPACKET ###
 4745                ###################
 4746
 4747                # If no PhenoPacket in analysis dict -> check in param
 4748                if "phenopacket" not in param_exomiser_analysis_dict:
 4749
 4750                    # If PhenoPacket in param -> load phenopacket json
 4751                    if param_exomiser.get("phenopacket", None):
 4752
 4753                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
 4754                        param_exomiser_phenopacket = full_path(
 4755                            param_exomiser_phenopacket
 4756                        )
 4757
 4758                        # If param phenopacket is a file and exists
 4759                        if isinstance(
 4760                            param_exomiser_phenopacket, str
 4761                        ) and os.path.exists(param_exomiser_phenopacket):
 4762                            # Load phenopacket file into analysis dict (either yaml or json)
 4763                            with open(param_exomiser_phenopacket) as json_file:
 4764                                param_exomiser_analysis_dict["phenopacket"] = (
 4765                                    yaml.safe_load(json_file)
 4766                                )
 4767
 4768                        # If param phenopacket is a dict
 4769                        elif isinstance(param_exomiser_phenopacket, dict):
 4770                            # Load phenopacket dict into analysis dict (either yaml or json)
 4771                            param_exomiser_analysis_dict["phenopacket"] = (
 4772                                param_exomiser_phenopacket
 4773                            )
 4774
 4775                        # Error phenopacket type
 4776                        else:
 4777                            log.error(f"Phenopacket type unknown. Check param file.")
 4778                            raise ValueError(
 4779                                f"Phenopacket type unknown. Check param file."
 4780                            )
 4781
 4782                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
 4783                if "phenopacket" not in param_exomiser_analysis_dict:
 4784
 4785                    # Init PhenoPacket
 4786                    param_exomiser_analysis_dict["phenopacket"] = {
 4787                        "id": "analysis",
 4788                        "proband": {},
 4789                    }
 4790
 4791                    ### Add subject ###
 4792
 4793                    # If subject exists
 4794                    param_exomiser_subject = param_exomiser.get("subject", {})
 4795
 4796                    # If subject not exists -> found sample ID
 4797                    if not param_exomiser_subject:
 4798
 4799                        # Found sample ID in param
 4800                        sample = param_exomiser.get("sample", None)
 4801
 4802                        # Find sample ID (first sample)
 4803                        if not sample:
 4804                            sample_list = self.get_header_sample_list()
 4805                            if len(sample_list) > 0:
 4806                                sample = sample_list[0]
 4807                            else:
 4808                                log.error(f"No sample found")
 4809                                raise ValueError(f"No sample found")
 4810
 4811                        # Create subject
 4812                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
 4813
 4814                    # Add to dict
 4815                    param_exomiser_analysis_dict["phenopacket"][
 4816                        "subject"
 4817                    ] = param_exomiser_subject
 4818
 4819                    ### Add "phenotypicFeatures" ###
 4820
 4821                    # If phenotypicFeatures exists
 4822                    param_exomiser_phenotypicfeatures = param_exomiser.get(
 4823                        "phenotypicFeatures", []
 4824                    )
 4825
 4826                    # If phenotypicFeatures not exists -> Try to infer from hpo list
 4827                    if not param_exomiser_phenotypicfeatures:
 4828
 4829                        # Found HPO in param
 4830                        param_exomiser_hpo = param_exomiser.get("hpo", [])
 4831
 4832                        # Split HPO if list in string format separated by comma
 4833                        if isinstance(param_exomiser_hpo, str):
 4834                            param_exomiser_hpo = param_exomiser_hpo.split(",")
 4835
 4836                        # Create HPO list
 4837                        for hpo in param_exomiser_hpo:
 4838                            hpo_clean = re.sub("[^0-9]", "", hpo)
 4839                            param_exomiser_phenotypicfeatures.append(
 4840                                {
 4841                                    "type": {
 4842                                        "id": f"HP:{hpo_clean}",
 4843                                        "label": f"HP:{hpo_clean}",
 4844                                    }
 4845                                }
 4846                            )
 4847
 4848                    # Add to dict
 4849                    param_exomiser_analysis_dict["phenopacket"][
 4850                        "phenotypicFeatures"
 4851                    ] = param_exomiser_phenotypicfeatures
 4852
 4853                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
 4854                    if not param_exomiser_phenotypicfeatures:
 4855                        for step in param_exomiser_analysis_dict.get(
 4856                            "analysis", {}
 4857                        ).get("steps", []):
 4858                            if "hiPhivePrioritiser" in step:
 4859                                param_exomiser_analysis_dict.get("analysis", {}).get(
 4860                                    "steps", []
 4861                                ).remove(step)
 4862
 4863                ### Add Input File ###
 4864
 4865                # Initial file name and htsFiles
 4866                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
 4867                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
 4868                    {
 4869                        "uri": tmp_vcf_name,
 4870                        "htsFormat": "VCF",
 4871                        "genomeAssembly": assembly,
 4872                    }
 4873                ]
 4874
 4875                ### Add metaData ###
 4876
 4877                # If metaData not in analysis dict
 4878                if "metaData" not in param_exomiser_analysis_dict:
 4879                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
 4880                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
 4881                        "createdBy": "howard",
 4882                        "phenopacketSchemaVersion": 1,
 4883                    }
 4884
 4885                ### OutputOptions ###
 4886
 4887                # Init output result folder
 4888                output_results = os.path.join(tmp_dir, "results")
 4889
 4890                # If no outputOptions in analysis dict
 4891                if "outputOptions" not in param_exomiser_analysis_dict:
 4892
 4893                    # default output formats
 4894                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
 4895
 4896                    # Get outputOptions in param
 4897                    output_options = param_exomiser.get("outputOptions", None)
 4898
 4899                    # If no output_options in param -> check
 4900                    if not output_options:
 4901                        output_options = {
 4902                            "outputContributingVariantsOnly": False,
 4903                            "numGenes": 0,
 4904                            "outputFormats": defaut_output_formats,
 4905                        }
 4906
 4907                    # Replace outputDirectory in output options
 4908                    output_options["outputDirectory"] = output_results
 4909                    output_options["outputFileName"] = "howard"
 4910
 4911                    # Add outputOptions in analysis dict
 4912                    param_exomiser_analysis_dict["outputOptions"] = output_options
 4913
 4914                else:
 4915
 4916                    # Replace output_results and output format (if exists in param)
 4917                    param_exomiser_analysis_dict["outputOptions"][
 4918                        "outputDirectory"
 4919                    ] = output_results
 4920                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
 4921                        list(
 4922                            set(
 4923                                param_exomiser_analysis_dict.get(
 4924                                    "outputOptions", {}
 4925                                ).get("outputFormats", [])
 4926                                + ["TSV_VARIANT", "VCF"]
 4927                            )
 4928                        )
 4929                    )
 4930
 4931                # log
 4932                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4933
 4934                ### ANALYSIS FILE ###
 4935                #####################
 4936
 4937                ### Full JSON analysis config file ###
 4938
 4939                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
 4940                with open(exomiser_analysis, "w") as fp:
 4941                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
 4942
 4943                ### SPLIT analysis and sample config files
 4944
 4945                # Splitted analysis dict
 4946                param_exomiser_analysis_dict_for_split = (
 4947                    param_exomiser_analysis_dict.copy()
 4948                )
 4949
 4950                # Phenopacket JSON file
 4951                exomiser_analysis_phenopacket = os.path.join(
 4952                    tmp_dir, "analysis_phenopacket.json"
 4953                )
 4954                with open(exomiser_analysis_phenopacket, "w") as fp:
 4955                    json.dump(
 4956                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
 4957                        fp,
 4958                        indent=4,
 4959                    )
 4960
 4961                # Analysis JSON file without Phenopacket parameters
 4962                param_exomiser_analysis_dict_for_split.pop("phenopacket")
 4963                exomiser_analysis_analysis = os.path.join(
 4964                    tmp_dir, "analysis_analysis.json"
 4965                )
 4966                with open(exomiser_analysis_analysis, "w") as fp:
 4967                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
 4968
                ### INITIAL VCF file ###
 4970                #######################
 4971
                ### Create list of samples to use and include into the initial VCF file ###
 4973
 4974                # Subject (main sample)
 4975                # Get sample ID in analysis dict
 4976                sample_subject = (
 4977                    param_exomiser_analysis_dict.get("phenopacket", {})
 4978                    .get("subject", {})
 4979                    .get("id", None)
 4980                )
 4981                sample_proband = (
 4982                    param_exomiser_analysis_dict.get("phenopacket", {})
 4983                    .get("proband", {})
 4984                    .get("subject", {})
 4985                    .get("id", None)
 4986                )
 4987                sample = []
 4988                if sample_subject:
 4989                    sample.append(sample_subject)
 4990                if sample_proband:
 4991                    sample.append(sample_proband)
 4992
 4993                # Get sample ID within Pedigree
 4994                pedigree_persons_list = (
 4995                    param_exomiser_analysis_dict.get("phenopacket", {})
 4996                    .get("pedigree", {})
 4997                    .get("persons", {})
 4998                )
 4999
 5000                # Create list with all sample ID in pedigree (if exists)
 5001                pedigree_persons = []
 5002                for person in pedigree_persons_list:
 5003                    pedigree_persons.append(person.get("individualId"))
 5004
                # Concat subject sample ID and sample IDs from the pedigree
 5006                samples = list(set(sample + pedigree_persons))
 5007
 5008                # Check if sample list is not empty
 5009                if not samples:
 5010                    log.error(f"No samples found")
 5011                    raise ValueError(f"No samples found")
 5012
 5013                # Create VCF with sample (either sample in param or first one by default)
 5014                # Export VCF file
 5015                self.export_variant_vcf(
 5016                    vcf_file=tmp_vcf_name,
 5017                    remove_info=True,
 5018                    add_samples=True,
 5019                    list_samples=samples,
 5020                    index=False,
 5021                )
 5022
 5023                ### Execute Exomiser ###
 5024                ########################
 5025
 5026                # Init command
 5027                exomiser_command = ""
 5028
 5029                # Command exomiser options
 5030                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
 5031
 5032                # Release
 5033                exomiser_release = param_exomiser.get("release", None)
 5034                if exomiser_release:
 5035                    # phenotype data version
 5036                    exomiser_options += (
 5037                        f" --exomiser.phenotype.data-version={exomiser_release} "
 5038                    )
 5039                    # data version
 5040                    exomiser_options += (
 5041                        f" --exomiser.{assembly}.data-version={exomiser_release} "
 5042                    )
 5043                    # variant white list
 5044                    variant_white_list_file = (
 5045                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
 5046                    )
 5047                    if os.path.exists(
 5048                        os.path.join(
 5049                            databases_folders, assembly, variant_white_list_file
 5050                        )
 5051                    ):
 5052                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
 5053
 5054                # transcript_source
 5055                transcript_source = param_exomiser.get(
 5056                    "transcript_source", None
 5057                )  # ucsc, refseq, ensembl
 5058                if transcript_source:
 5059                    exomiser_options += (
 5060                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
 5061                    )
 5062
 5063                # If analysis contain proband param
 5064                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
 5065                    "proband", {}
 5066                ):
 5067                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
 5068
 5069                # If no proband (usually uniq sample)
 5070                else:
 5071                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
 5072
 5073                # Log
 5074                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
 5075
 5076                # Run command
 5077                result = subprocess.call(
 5078                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
 5079                )
 5080                if result:
 5081                    log.error("Exomiser command failed")
 5082                    raise ValueError("Exomiser command failed")
 5083
 5084                ### RESULTS ###
 5085                ###############
 5086
 5087                ### Annotate with TSV fields ###
 5088
 5089                # Init result tsv file
 5090                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
 5091
 5092                # Init result tsv file
 5093                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
 5094
 5095                # Parse TSV file and explode columns in INFO field
 5096                if exomiser_to_info and os.path.exists(output_results_tsv):
 5097
 5098                    # Log
 5099                    log.debug("Exomiser columns to VCF INFO field")
 5100
 5101                    # Retrieve columns and types
 5102                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
 5103                    output_results_tsv_df = self.get_query_to_df(query)
 5104                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
 5105
 5106                    # Init concat fields for update
 5107                    sql_query_update_concat_fields = []
 5108
 5109                    # Fields to avoid
 5110                    fields_to_avoid = [
 5111                        "CONTIG",
 5112                        "START",
 5113                        "END",
 5114                        "REF",
 5115                        "ALT",
 5116                        "QUAL",
 5117                        "FILTER",
 5118                        "GENOTYPE",
 5119                    ]
 5120
 5121                    # List all columns to add into header
 5122                    for header_column in output_results_tsv_columns:
 5123
 5124                        # If header column is enable
 5125                        if header_column not in fields_to_avoid:
 5126
 5127                            # Header info type
 5128                            header_info_type = "String"
 5129                            header_column_df = output_results_tsv_df[header_column]
 5130                            header_column_df_dtype = header_column_df.dtype
 5131                            if header_column_df_dtype == object:
 5132                                if (
 5133                                    pd.to_numeric(header_column_df, errors="coerce")
 5134                                    .notnull()
 5135                                    .all()
 5136                                ):
 5137                                    header_info_type = "Float"
 5138                            else:
 5139                                header_info_type = "Integer"
 5140
 5141                            # Header info
 5142                            characters_to_validate = ["-"]
 5143                            pattern = "[" + "".join(characters_to_validate) + "]"
 5144                            header_info_name = re.sub(
 5145                                pattern,
 5146                                "_",
 5147                                f"Exomiser_{header_column}".replace("#", ""),
 5148                            )
 5149                            header_info_number = "."
 5150                            header_info_description = (
 5151                                f"Exomiser {header_column} annotation"
 5152                            )
 5153                            header_info_source = "Exomiser"
 5154                            header_info_version = "unknown"
 5155                            header_info_code = CODE_TYPE_MAP[header_info_type]
 5156                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
 5157                                header_info_name,
 5158                                header_info_number,
 5159                                header_info_type,
 5160                                header_info_description,
 5161                                header_info_source,
 5162                                header_info_version,
 5163                                header_info_code,
 5164                            )
 5165
 5166                            # Add field to add for update to concat fields
 5167                            sql_query_update_concat_fields.append(
 5168                                f"""
 5169                                CASE
 5170                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
 5171                                    THEN concat(
 5172                                        '{header_info_name}=',
 5173                                        table_parquet."{header_column}",
 5174                                        ';'
 5175                                        )
 5176
 5177                                    ELSE ''
 5178                                END
 5179                            """
 5180                            )
 5181
 5182                    # Update query
 5183                    sql_query_update = f"""
 5184                        UPDATE {table_variants} as table_variants
 5185                            SET INFO = concat(
 5186                                            CASE
 5187                                                WHEN INFO NOT IN ('', '.')
 5188                                                THEN INFO
 5189                                                ELSE ''
 5190                                            END,
 5191                                            CASE
 5192                                                WHEN table_variants.INFO NOT IN ('','.')
 5193                                                THEN ';'
 5194                                                ELSE ''
 5195                                            END,
 5196                                            (
 5197                                            SELECT 
 5198                                                concat(
 5199                                                    {",".join(sql_query_update_concat_fields)}
 5200                                                )
 5201                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
 5202                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
 5203                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
 5204                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 5205                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 5206                                            )
 5207                                        )
 5208                            ;
 5209                        """
 5210
 5211                    # Update
 5212                    self.conn.execute(sql_query_update)
 5213
 5214                ### Annotate with VCF INFO field ###
 5215
 5216                # Init result VCF file
 5217                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
 5218
 5219                # If VCF exists
 5220                if os.path.exists(output_results_vcf):
 5221
 5222                    # Log
 5223                    log.debug("Exomiser result VCF update variants")
 5224
 5225                    # Find Exomiser INFO field annotation in header
 5226                    with gzip.open(output_results_vcf, "rt") as f:
 5227                        header_list = self.read_vcf_header(f)
 5228                    exomiser_vcf_header = vcf.Reader(
 5229                        io.StringIO("\n".join(header_list))
 5230                    )
 5231
 5232                    # Add annotation INFO field to header
 5233                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
 5234
 5235                    # Update variants with VCF
 5236                    self.update_from_vcf(output_results_vcf)
 5237
 5238        return True
 5239
 5240    def annotation_snpeff(self, threads: int = None) -> None:
 5241        """
 5242        This function annotate with snpEff
 5243
 5244        :param threads: The number of threads to use
 5245        :return: the value of the variable "return_value".
 5246        """
 5247
 5248        # DEBUG
 5249        log.debug("Start annotation with snpeff databases")
 5250
 5251        # Threads
 5252        if not threads:
 5253            threads = self.get_threads()
 5254        log.debug("Threads: " + str(threads))
 5255
 5256        # DEBUG
 5257        delete_tmp = True
 5258        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5259            delete_tmp = False
 5260            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5261
 5262        # Config
 5263        config = self.get_config()
 5264        log.debug("Config: " + str(config))
 5265
 5266        # Config - Folders - Databases
 5267        databases_folders = (
 5268            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
 5269        )
 5270        log.debug("Databases annotations: " + str(databases_folders))
 5271
 5272        # Config - snpEff bin command
 5273        snpeff_bin_command = get_bin_command(
 5274            bin="snpEff.jar",
 5275            tool="snpeff",
 5276            bin_type="jar",
 5277            config=config,
 5278            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 5279        )
 5280        if not snpeff_bin_command:
 5281            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
 5282            log.error(msg_err)
 5283            raise ValueError(msg_err)
 5284
 5285        # Config - snpEff databases
 5286        snpeff_databases = (
 5287            config.get("folders", {})
 5288            .get("databases", {})
 5289            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
 5290        )
 5291        snpeff_databases = full_path(snpeff_databases)
 5292        if snpeff_databases is not None and snpeff_databases != "":
 5293            log.debug(f"Create snpEff databases folder")
 5294            if not os.path.exists(snpeff_databases):
 5295                os.makedirs(snpeff_databases)
 5296
 5297        # Param
 5298        param = self.get_param()
 5299        log.debug("Param: " + str(param))
 5300
 5301        # Param
 5302        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
 5303        log.debug("Options: " + str(options))
 5304
 5305        # Param - Assembly
 5306        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 5307
 5308        # Param - Options
 5309        snpeff_options = (
 5310            param.get("annotation", {}).get("snpeff", {}).get("options", "")
 5311        )
 5312        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
 5313        snpeff_csvstats = (
 5314            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
 5315        )
 5316        if snpeff_stats:
 5317            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
 5318            snpeff_stats = full_path(snpeff_stats)
 5319            snpeff_options += f" -stats {snpeff_stats}"
 5320        if snpeff_csvstats:
 5321            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
 5322            snpeff_csvstats = full_path(snpeff_csvstats)
 5323            snpeff_options += f" -csvStats {snpeff_csvstats}"
 5324
 5325        # Data
 5326        table_variants = self.get_table_variants()
 5327
 5328        # Check if not empty
 5329        log.debug("Check if not empty")
 5330        sql_query_chromosomes = (
 5331            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5332        )
 5333        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
 5334        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 5335            log.info(f"VCF empty")
 5336            return
 5337
 5338        # Export in VCF
 5339        log.debug("Create initial file to annotate")
 5340        tmp_vcf = NamedTemporaryFile(
 5341            prefix=self.get_prefix(),
 5342            dir=self.get_tmp_dir(),
 5343            suffix=".vcf.gz",
 5344            delete=True,
 5345        )
 5346        tmp_vcf_name = tmp_vcf.name
 5347
 5348        # VCF header
 5349        vcf_reader = self.get_header()
 5350        log.debug("Initial header: " + str(vcf_reader.infos))
 5351
 5352        # Existing annotations
 5353        for vcf_annotation in self.get_header().infos:
 5354
 5355            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5356            log.debug(
 5357                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5358            )
 5359
 5360        # Memory limit
 5361        # if config.get("memory", None):
 5362        #     memory_limit = config.get("memory", "8G")
 5363        # else:
 5364        #     memory_limit = "8G"
 5365        memory_limit = self.get_memory("8G")
 5366        log.debug(f"memory_limit: {memory_limit}")
 5367
 5368        # snpEff java options
 5369        snpeff_java_options = (
 5370            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 5371        )
 5372        log.debug(f"Exomiser java options: {snpeff_java_options}")
 5373
 5374        force_update_annotation = True
 5375
 5376        if "ANN" not in self.get_header().infos or force_update_annotation:
 5377
 5378            # Check snpEff database
 5379            log.debug(f"Check snpEff databases {[assembly]}")
 5380            databases_download_snpeff(
 5381                folder=snpeff_databases, assemblies=[assembly], config=config
 5382            )
 5383
 5384            # Export VCF file
 5385            self.export_variant_vcf(
 5386                vcf_file=tmp_vcf_name,
 5387                remove_info=True,
 5388                add_samples=False,
 5389                index=True,
 5390            )
 5391
 5392            # Tmp file
 5393            err_files = []
 5394            tmp_annotate_vcf = NamedTemporaryFile(
 5395                prefix=self.get_prefix(),
 5396                dir=self.get_tmp_dir(),
 5397                suffix=".vcf",
 5398                delete=False,
 5399            )
 5400            tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5401            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5402            err_files.append(tmp_annotate_vcf_name_err)
 5403
 5404            # Command
 5405            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
 5406            log.debug(f"Annotation - snpEff command: {snpeff_command}")
 5407            run_parallel_commands([snpeff_command], 1)
 5408
 5409            # Error messages
 5410            log.info(f"Error/Warning messages:")
 5411            error_message_command_all = []
 5412            error_message_command_warning = []
 5413            error_message_command_err = []
 5414            for err_file in err_files:
 5415                with open(err_file, "r") as f:
 5416                    for line in f:
 5417                        message = line.strip()
 5418                        error_message_command_all.append(message)
 5419                        if line.startswith("[W::"):
 5420                            error_message_command_warning.append(message)
 5421                        if line.startswith("[E::"):
 5422                            error_message_command_err.append(f"{err_file}: " + message)
 5423            # log info
 5424            for message in list(
 5425                set(error_message_command_err + error_message_command_warning)
 5426            ):
 5427                log.info(f"   {message}")
 5428            # debug info
 5429            for message in list(set(error_message_command_all)):
 5430                log.debug(f"   {message}")
 5431            # failed
 5432            if len(error_message_command_err):
 5433                log.error("Annotation failed: Error in commands")
 5434                raise ValueError("Annotation failed: Error in commands")
 5435
 5436            # Find annotation in header
 5437            with open(tmp_annotate_vcf_name, "rt") as f:
 5438                header_list = self.read_vcf_header(f)
 5439            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5440
 5441            for ann in annovar_vcf_header.infos:
 5442                if ann not in self.get_header().infos:
 5443                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5444
 5445            # Update variants
 5446            log.info(f"Annotation - Updating...")
 5447            self.update_from_vcf(tmp_annotate_vcf_name)
 5448
 5449        else:
 5450            if "ANN" in self.get_header().infos:
 5451                log.debug(f"Existing snpEff annotations in VCF")
 5452            if force_update_annotation:
 5453                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
 5454
 5455    def annotation_annovar(self, threads: int = None) -> None:
 5456        """
 5457        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
 5458        annotations
 5459
 5460        :param threads: number of threads to use
 5461        :return: the value of the variable "return_value".
 5462        """
 5463
 5464        # DEBUG
 5465        log.debug("Start annotation with Annovar databases")
 5466
 5467        # Threads
 5468        if not threads:
 5469            threads = self.get_threads()
 5470        log.debug("Threads: " + str(threads))
 5471
 5472        # Tmp en Err files
 5473        tmp_files = []
 5474        err_files = []
 5475
 5476        # DEBUG
 5477        delete_tmp = True
 5478        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5479            delete_tmp = False
 5480            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5481
 5482        # Config
 5483        config = self.get_config()
 5484        log.debug("Config: " + str(config))
 5485
 5486        # Config - Folders - Databases
 5487        databases_folders = (
 5488            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
 5489        )
 5490        log.debug("Databases annotations: " + str(databases_folders))
 5491
 5492        # Config - annovar bin command
 5493        annovar_bin_command = get_bin_command(
 5494            bin="table_annovar.pl",
 5495            tool="annovar",
 5496            bin_type="perl",
 5497            config=config,
 5498            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
 5499        )
 5500        if not annovar_bin_command:
 5501            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
 5502            log.error(msg_err)
 5503            raise ValueError(msg_err)
 5504
 5505        # Config - BCFTools bin command
 5506        bcftools_bin_command = get_bin_command(
 5507            bin="bcftools",
 5508            tool="bcftools",
 5509            bin_type="bin",
 5510            config=config,
 5511            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 5512        )
 5513        if not bcftools_bin_command:
 5514            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 5515            log.error(msg_err)
 5516            raise ValueError(msg_err)
 5517
 5518        # Config - annovar databases
 5519        annovar_databases = (
 5520            config.get("folders", {})
 5521            .get("databases", {})
 5522            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
 5523        )
 5524        if annovar_databases is not None:
 5525            if isinstance(annovar_databases, list):
 5526                annovar_databases = full_path(annovar_databases[0])
 5527                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
 5528            annovar_databases = full_path(annovar_databases)
 5529            if not os.path.exists(annovar_databases):
 5530                log.info(f"Annovar databases folder '{annovar_databases}' created")
 5531                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
 5532        else:
 5533            msg_err = f"Annovar databases configuration failed"
 5534            log.error(msg_err)
 5535            raise ValueError(msg_err)
 5536
 5537        # Param
 5538        param = self.get_param()
 5539        log.debug("Param: " + str(param))
 5540
 5541        # Param - options
 5542        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
 5543        log.debug("Options: " + str(options))
 5544
 5545        # Param - annotations
 5546        annotations = (
 5547            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
 5548        )
 5549        log.debug("Annotations: " + str(annotations))
 5550
 5551        # Param - Assembly
 5552        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 5553
 5554        # Annovar database assembly
 5555        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
 5556        if annovar_databases_assembly != "" and not os.path.exists(
 5557            annovar_databases_assembly
 5558        ):
 5559            os.makedirs(annovar_databases_assembly)
 5560
 5561        # Data
 5562        table_variants = self.get_table_variants()
 5563
 5564        # Check if not empty
 5565        log.debug("Check if not empty")
 5566        sql_query_chromosomes = (
 5567            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5568        )
 5569        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 5570        if not sql_query_chromosomes_df["count"][0]:
 5571            log.info(f"VCF empty")
 5572            return
 5573
 5574        # VCF header
 5575        vcf_reader = self.get_header()
 5576        log.debug("Initial header: " + str(vcf_reader.infos))
 5577
 5578        # Existing annotations
 5579        for vcf_annotation in self.get_header().infos:
 5580
 5581            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5582            log.debug(
 5583                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5584            )
 5585
 5586        force_update_annotation = True
 5587
 5588        if annotations:
 5589
 5590            commands = []
 5591            tmp_annotates_vcf_name_list = []
 5592
 5593            # Export in VCF
 5594            log.debug("Create initial file to annotate")
 5595            tmp_vcf = NamedTemporaryFile(
 5596                prefix=self.get_prefix(),
 5597                dir=self.get_tmp_dir(),
 5598                suffix=".vcf.gz",
 5599                delete=False,
 5600            )
 5601            tmp_vcf_name = tmp_vcf.name
 5602            tmp_files.append(tmp_vcf_name)
 5603            tmp_files.append(tmp_vcf_name + ".tbi")
 5604
 5605            # Export VCF file
 5606            self.export_variant_vcf(
 5607                vcf_file=tmp_vcf_name,
 5608                remove_info=".",
 5609                add_samples=False,
 5610                index=True,
 5611            )
 5612
 5613            # Create file for field rename
 5614            log.debug("Create file for field rename")
 5615            tmp_rename = NamedTemporaryFile(
 5616                prefix=self.get_prefix(),
 5617                dir=self.get_tmp_dir(),
 5618                suffix=".rename",
 5619                delete=False,
 5620            )
 5621            tmp_rename_name = tmp_rename.name
 5622            tmp_files.append(tmp_rename_name)
 5623
 5624            # Check Annovar database
 5625            log.debug(
 5626                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
 5627            )
 5628            databases_download_annovar(
 5629                folder=annovar_databases,
 5630                files=list(annotations.keys()),
 5631                assemblies=[assembly],
 5632            )
 5633
 5634            for annotation in annotations:
 5635                annotation_fields = annotations[annotation]
 5636
 5637                if not annotation_fields:
 5638                    annotation_fields = {"INFO": None}
 5639
 5640                log.info(f"Annotations Annovar - database '{annotation}'")
 5641                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")
 5642
 5643                # Tmp file for annovar
 5644                err_files = []
 5645                tmp_annotate_vcf_directory = TemporaryDirectory(
 5646                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
 5647                )
 5648                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
 5649                tmp_annotate_vcf_name_annovar = (
 5650                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
 5651                )
 5652                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
 5653                err_files.append(tmp_annotate_vcf_name_err)
 5654                tmp_files.append(tmp_annotate_vcf_name_err)
 5655
 5656                # Tmp file final vcf annotated by annovar
 5657                tmp_annotate_vcf = NamedTemporaryFile(
 5658                    prefix=self.get_prefix(),
 5659                    dir=self.get_tmp_dir(),
 5660                    suffix=".vcf.gz",
 5661                    delete=False,
 5662                )
 5663                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5664                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
 5665                tmp_files.append(tmp_annotate_vcf_name)
 5666                tmp_files.append(tmp_annotate_vcf_name + ".tbi")
 5667
 5668                # Number of fields
 5669                annotation_list = []
 5670                annotation_renamed_list = []
 5671
 5672                for annotation_field in annotation_fields:
 5673
 5674                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 5675                    annotation_fields_new_name = annotation_fields.get(
 5676                        annotation_field, annotation_field
 5677                    )
 5678                    if not annotation_fields_new_name:
 5679                        annotation_fields_new_name = annotation_field
 5680
 5681                    if (
 5682                        force_update_annotation
 5683                        or annotation_fields_new_name not in self.get_header().infos
 5684                    ):
 5685                        annotation_list.append(annotation_field)
 5686                        annotation_renamed_list.append(annotation_fields_new_name)
 5687                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
 5688                        log.warning(
 5689                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 5690                        )
 5691
 5692                    # Add rename info
 5693                    run_parallel_commands(
 5694                        [
 5695                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
 5696                        ],
 5697                        1,
 5698                    )
 5699
 5700                # log.debug("fields_to_removed: " + str(fields_to_removed))
 5701                log.debug("annotation_list: " + str(annotation_list))
 5702
 5703                # protocol
 5704                protocol = annotation
 5705
 5706                # argument
 5707                argument = ""
 5708
 5709                # operation
 5710                operation = "f"
 5711                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
 5712                    "ensGene"
 5713                ):
 5714                    operation = "g"
 5715                    if options.get("genebase", None):
 5716                        argument = f"""'{options.get("genebase","")}'"""
 5717                elif annotation in ["cytoBand"]:
 5718                    operation = "r"
 5719
 5720                # argument option
 5721                argument_option = ""
 5722                if argument != "":
 5723                    argument_option = " --argument " + argument
 5724
 5725                # command options
 5726                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
 5727                for option in options:
 5728                    if option not in ["genebase"]:
 5729                        command_options += f""" --{option}={options[option]}"""
 5730
 5731                # Command
 5732
 5733                # Command - Annovar
 5734                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
 5735                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")
 5736
 5737                # Command - start pipe
 5738                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """
 5739
 5740                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
 5741                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """
 5742
 5743                # Command - Special characters (refGene annotation)
 5744                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """
 5745
 5746                # Command - Clean empty fields (with value ".")
 5747                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """
 5748
 5749                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
 5750                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
 5751                if "ALL" not in annotation_list and "INFO" not in annotation_list:
 5752                    # for ann in annotation_renamed_list:
 5753                    for ann in annotation_list:
 5754                        annovar_fields_to_keep.append(f"^INFO/{ann}")
 5755
 5756                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """
 5757
 5758                # Command - indexing
 5759                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """
 5760
 5761                log.debug(f"Annotation - Annovar command: {command_annovar}")
 5762                run_parallel_commands([command_annovar], 1)
 5763
 5764                # Error messages
 5765                log.info(f"Error/Warning messages:")
 5766                error_message_command_all = []
 5767                error_message_command_warning = []
 5768                error_message_command_err = []
 5769                for err_file in err_files:
 5770                    with open(err_file, "r") as f:
 5771                        for line in f:
 5772                            message = line.strip()
 5773                            error_message_command_all.append(message)
 5774                            if line.startswith("[W::") or line.startswith("WARNING"):
 5775                                error_message_command_warning.append(message)
 5776                            if line.startswith("[E::") or line.startswith("ERROR"):
 5777                                error_message_command_err.append(
 5778                                    f"{err_file}: " + message
 5779                                )
 5780                # log info
 5781                for message in list(
 5782                    set(error_message_command_err + error_message_command_warning)
 5783                ):
 5784                    log.info(f"   {message}")
 5785                # debug info
 5786                for message in list(set(error_message_command_all)):
 5787                    log.debug(f"   {message}")
 5788                # failed
 5789                if len(error_message_command_err):
 5790                    log.error("Annotation failed: Error in commands")
 5791                    raise ValueError("Annotation failed: Error in commands")
 5792
 5793            if tmp_annotates_vcf_name_list:
 5794
 5795                # List of annotated files
 5796                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)
 5797
 5798                # Tmp file
 5799                tmp_annotate_vcf = NamedTemporaryFile(
 5800                    prefix=self.get_prefix(),
 5801                    dir=self.get_tmp_dir(),
 5802                    suffix=".vcf.gz",
 5803                    delete=False,
 5804                )
 5805                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5806                tmp_files.append(tmp_annotate_vcf_name)
 5807                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5808                err_files.append(tmp_annotate_vcf_name_err)
 5809                tmp_files.append(tmp_annotate_vcf_name_err)
 5810
 5811                # Command merge
 5812                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
 5813                log.info(
 5814                    f"Annotation Annovar - Annotation merging "
 5815                    + str(len(tmp_annotates_vcf_name_list))
 5816                    + " annotated files"
 5817                )
 5818                log.debug(f"Annotation - merge command: {merge_command}")
 5819                run_parallel_commands([merge_command], 1)
 5820
 5821                # Find annotation in header
 5822                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
 5823                    header_list = self.read_vcf_header(f)
 5824                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5825
 5826                for ann in annovar_vcf_header.infos:
 5827                    if ann not in self.get_header().infos:
 5828                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5829
 5830                # Update variants
 5831                log.info(f"Annotation Annovar - Updating...")
 5832                self.update_from_vcf(tmp_annotate_vcf_name)
 5833
 5834            # Clean files
 5835            # Tmp file remove command
 5836            if True:
 5837                tmp_files_remove_command = ""
 5838                if tmp_files:
 5839                    tmp_files_remove_command = " ".join(tmp_files)
 5840                clean_command = f" rm -f {tmp_files_remove_command} "
 5841                log.debug(f"Annotation Annovar - Annotation cleaning ")
 5842                log.debug(f"Annotation - cleaning command: {clean_command}")
 5843                run_parallel_commands([clean_command], 1)
 5844
 5845    # Parquet
 5846    def annotation_parquet(self, threads: int = None) -> None:
 5847        """
 5848        It takes a VCF file, and annotates it with a parquet file
 5849
 5850        :param threads: number of threads to use for the annotation
 5851        :return: the value of the variable "result".
 5852        """
 5853
 5854        # DEBUG
 5855        log.debug("Start annotation with parquet databases")
 5856
 5857        # Threads
 5858        if not threads:
 5859            threads = self.get_threads()
 5860        log.debug("Threads: " + str(threads))
 5861
 5862        # DEBUG
 5863        delete_tmp = True
 5864        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5865            delete_tmp = False
 5866            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5867
 5868        # Config
 5869        databases_folders = set(
 5870            self.get_config()
 5871            .get("folders", {})
 5872            .get("databases", {})
 5873            .get("annotations", ["."])
 5874            + self.get_config()
 5875            .get("folders", {})
 5876            .get("databases", {})
 5877            .get("parquet", ["."])
 5878        )
 5879        log.debug("Databases annotations: " + str(databases_folders))
 5880
 5881        # Param
 5882        annotations = (
 5883            self.get_param()
 5884            .get("annotation", {})
 5885            .get("parquet", {})
 5886            .get("annotations", None)
 5887        )
 5888        log.debug("Annotations: " + str(annotations))
 5889
 5890        # Assembly
 5891        assembly = self.get_param().get(
 5892            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 5893        )
 5894
 5895        # Force Update Annotation
 5896        force_update_annotation = (
 5897            self.get_param()
 5898            .get("annotation", {})
 5899            .get("options", {})
 5900            .get("annotations_update", False)
 5901        )
 5902        log.debug(f"force_update_annotation={force_update_annotation}")
 5903        force_append_annotation = (
 5904            self.get_param()
 5905            .get("annotation", {})
 5906            .get("options", {})
 5907            .get("annotations_append", False)
 5908        )
 5909        log.debug(f"force_append_annotation={force_append_annotation}")
 5910
 5911        # Data
 5912        table_variants = self.get_table_variants()
 5913
 5914        # Check if not empty
 5915        log.debug("Check if not empty")
 5916        sql_query_chromosomes_df = self.get_query_to_df(
 5917            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
 5918        )
 5919        if not sql_query_chromosomes_df["count"][0]:
 5920            log.info(f"VCF empty")
 5921            return
 5922
 5923        # VCF header
 5924        vcf_reader = self.get_header()
 5925        log.debug("Initial header: " + str(vcf_reader.infos))
 5926
 5927        # Nb Variants POS
 5928        log.debug("NB Variants Start")
 5929        nb_variants = self.conn.execute(
 5930            f"SELECT count(*) AS count FROM variants"
 5931        ).fetchdf()["count"][0]
 5932        log.debug("NB Variants Stop")
 5933
 5934        # Existing annotations
 5935        for vcf_annotation in self.get_header().infos:
 5936
 5937            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5938            log.debug(
 5939                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5940            )
 5941
 5942        # Added columns
 5943        added_columns = []
 5944
 5945        # drop indexes
 5946        log.debug(f"Drop indexes...")
 5947        self.drop_indexes()
 5948
 5949        if annotations:
 5950
 5951            if "ALL" in annotations:
 5952
 5953                all_param = annotations.get("ALL", {})
 5954                all_param_formats = all_param.get("formats", None)
 5955                all_param_releases = all_param.get("releases", None)
 5956
 5957                databases_infos_dict = self.scan_databases(
 5958                    database_formats=all_param_formats,
 5959                    database_releases=all_param_releases,
 5960                )
 5961                for database_infos in databases_infos_dict.keys():
 5962                    if database_infos not in annotations:
 5963                        annotations[database_infos] = {"INFO": None}
 5964
 5965            for annotation in annotations:
 5966
 5967                if annotation in ["ALL"]:
 5968                    continue
 5969
 5970                # Annotation Name
 5971                annotation_name = os.path.basename(annotation)
 5972
 5973                # Annotation fields
 5974                annotation_fields = annotations[annotation]
 5975                if not annotation_fields:
 5976                    annotation_fields = {"INFO": None}
 5977
 5978                log.debug(f"Annotation '{annotation_name}'")
 5979                log.debug(
 5980                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 5981                )
 5982
 5983                # Create Database
 5984                database = Database(
 5985                    database=annotation,
 5986                    databases_folders=databases_folders,
 5987                    assembly=assembly,
 5988                )
 5989
 5990                # Find files
 5991                parquet_file = database.get_database()
 5992                parquet_hdr_file = database.get_header_file()
 5993                parquet_type = database.get_type()
 5994
 5995                # Check if files exists
 5996                if not parquet_file or not parquet_hdr_file:
 5997                    msg_err_list = []
 5998                    if not parquet_file:
 5999                        msg_err_list.append(
 6000                            f"Annotation failed: Annotation file not found"
 6001                        )
 6002                    if parquet_file and not parquet_hdr_file:
 6003                        msg_err_list.append(
 6004                            f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'"
 6005                        )
 6006
 6007                    log.error(". ".join(msg_err_list))
 6008                    raise ValueError(". ".join(msg_err_list))
 6009                else:
 6010                    # Get parquet connexion
 6011                    parquet_sql_attach = database.get_sql_database_attach(
 6012                        output="query"
 6013                    )
 6014                    if parquet_sql_attach:
 6015                        self.conn.execute(parquet_sql_attach)
 6016                    parquet_file_link = database.get_sql_database_link()
 6017                    # Log
 6018                    log.debug(
 6019                        f"Annotation '{annotation_name}' - file: "
 6020                        + str(parquet_file)
 6021                        + " and "
 6022                        + str(parquet_hdr_file)
 6023                    )
 6024
 6025                    # Database full header columns
 6026                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
 6027                        parquet_hdr_file
 6028                    )
 6029                    # Log
 6030                    log.debug(
 6031                        "Annotation database header columns : "
 6032                        + str(parquet_hdr_vcf_header_columns)
 6033                    )
 6034
 6035                    # Load header as VCF object
 6036                    parquet_hdr_vcf_header_infos = database.get_header().infos
 6037                    # Log
 6038                    log.debug(
 6039                        "Annotation database header: "
 6040                        + str(parquet_hdr_vcf_header_infos)
 6041                    )
 6042
 6043                    # Get extra infos
 6044                    parquet_columns = database.get_extra_columns()
 6045                    # Log
 6046                    log.debug("Annotation database Columns: " + str(parquet_columns))
 6047
 6048                    # Add extra columns if "ALL" in annotation_fields
 6049                    # if "ALL" in annotation_fields:
 6050                    #     allow_add_extra_column = True
 6051                    if "ALL" in annotation_fields and database.get_extra_columns():
 6052                        for extra_column in database.get_extra_columns():
 6053                            if (
 6054                                extra_column not in annotation_fields
 6055                                and extra_column.replace("INFO/", "")
 6056                                not in parquet_hdr_vcf_header_infos
 6057                            ):
 6058                                parquet_hdr_vcf_header_infos[extra_column] = (
 6059                                    vcf.parser._Info(
 6060                                        extra_column,
 6061                                        ".",
 6062                                        "String",
 6063                                        f"{extra_column} description",
 6064                                        "unknown",
 6065                                        "unknown",
 6066                                        self.code_type_map["String"],
 6067                                    )
 6068                                )
 6069
 6070                    # For all fields in database
 6071                    annotation_fields_all = False
 6072                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 6073                        annotation_fields_all = True
 6074                        annotation_fields = {
 6075                            key: key for key in parquet_hdr_vcf_header_infos
 6076                        }
 6077
 6078                        log.debug(
 6079                            "Annotation database header - All annotations added: "
 6080                            + str(annotation_fields)
 6081                        )
 6082
 6083                    # Init
 6084
 6085                    # List of annotation fields to use
 6086                    sql_query_annotation_update_info_sets = []
 6087
 6088                    # List of annotation to agregate
 6089                    sql_query_annotation_to_agregate = []
 6090
 6091                    # Number of fields
 6092                    nb_annotation_field = 0
 6093
 6094                    # Annotation fields processed
 6095                    annotation_fields_processed = []
 6096
 6097                    # Columns mapping
 6098                    map_columns = database.map_columns(
 6099                        columns=annotation_fields, prefixes=["INFO/"]
 6100                    )
 6101
 6102                    # Query dict for fields to remove (update option)
 6103                    query_dict_remove = {}
 6104
 6105                    # Fetch Anotation fields
 6106                    for annotation_field in annotation_fields:
 6107
 6108                        # annotation_field_column
 6109                        annotation_field_column = map_columns.get(
 6110                            annotation_field, "INFO"
 6111                        )
 6112
 6113                        # field new name, if parametered
 6114                        annotation_fields_new_name = annotation_fields.get(
 6115                            annotation_field, annotation_field
 6116                        )
 6117                        if not annotation_fields_new_name:
 6118                            annotation_fields_new_name = annotation_field
 6119
 6120                        # To annotate
 6121                        # force_update_annotation = True
 6122                        # force_append_annotation = True
 6123                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
 6124                        if annotation_field in parquet_hdr_vcf_header_infos and (
 6125                            force_update_annotation
 6126                            or force_append_annotation
 6127                            or (
 6128                                annotation_fields_new_name
 6129                                not in self.get_header().infos
 6130                            )
 6131                        ):
 6132
 6133                            # Add field to annotation to process list
 6134                            annotation_fields_processed.append(
 6135                                annotation_fields_new_name
 6136                            )
 6137
 6138                            # explode infos for the field
 6139                            annotation_fields_new_name_info_msg = ""
 6140                            if (
 6141                                force_update_annotation
 6142                                and annotation_fields_new_name
 6143                                in self.get_header().infos
 6144                            ):
 6145                                # Remove field from INFO
 6146                                query = f"""
 6147                                    UPDATE {table_variants} as table_variants
 6148                                    SET INFO = REGEXP_REPLACE(
 6149                                                concat(table_variants.INFO,''),
 6150                                                ';*{annotation_fields_new_name}=[^;]*',
 6151                                                ''
 6152                                                )
 6153                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
 6154                                """
 6155                                annotation_fields_new_name_info_msg = " [update]"
 6156                                query_dict_remove[
 6157                                    f"remove 'INFO/{annotation_fields_new_name}'"
 6158                                ] = query
 6159
 6160                            # Sep between fields in INFO
 6161                            nb_annotation_field += 1
 6162                            if nb_annotation_field > 1:
 6163                                annotation_field_sep = ";"
 6164                            else:
 6165                                annotation_field_sep = ""
 6166
 6167                            log.info(
 6168                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
 6169                            )
 6170
 6171                            # Add INFO field to header
 6172                            parquet_hdr_vcf_header_infos_number = (
 6173                                parquet_hdr_vcf_header_infos[annotation_field].num
 6174                                or "."
 6175                            )
 6176                            parquet_hdr_vcf_header_infos_type = (
 6177                                parquet_hdr_vcf_header_infos[annotation_field].type
 6178                                or "String"
 6179                            )
 6180                            parquet_hdr_vcf_header_infos_description = (
 6181                                parquet_hdr_vcf_header_infos[annotation_field].desc
 6182                                or f"{annotation_field} description"
 6183                            )
 6184                            parquet_hdr_vcf_header_infos_source = (
 6185                                parquet_hdr_vcf_header_infos[annotation_field].source
 6186                                or "unknown"
 6187                            )
 6188                            parquet_hdr_vcf_header_infos_version = (
 6189                                parquet_hdr_vcf_header_infos[annotation_field].version
 6190                                or "unknown"
 6191                            )
 6192
 6193                            vcf_reader.infos[annotation_fields_new_name] = (
 6194                                vcf.parser._Info(
 6195                                    annotation_fields_new_name,
 6196                                    parquet_hdr_vcf_header_infos_number,
 6197                                    parquet_hdr_vcf_header_infos_type,
 6198                                    parquet_hdr_vcf_header_infos_description,
 6199                                    parquet_hdr_vcf_header_infos_source,
 6200                                    parquet_hdr_vcf_header_infos_version,
 6201                                    self.code_type_map[
 6202                                        parquet_hdr_vcf_header_infos_type
 6203                                    ],
 6204                                )
 6205                            )
 6206
 6207                            # Append
 6208                            if force_append_annotation:
 6209                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
 6210                            else:
 6211                                query_case_when_append = ""
 6212
 6213                            # Annotation/Update query fields
 6214                            # Found in INFO column
 6215                            if (
 6216                                annotation_field_column == "INFO"
 6217                                and "INFO" in parquet_hdr_vcf_header_columns
 6218                            ):
 6219                                sql_query_annotation_update_info_sets.append(
 6220                                    f"""
 6221                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
 6222                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
 6223                                        ELSE ''
 6224                                    END
 6225                                """
 6226                                )
 6227                            # Found in a specific column
 6228                            else:
 6229                                sql_query_annotation_update_info_sets.append(
 6230                                    f"""
 6231                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
 6232                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
 6233                                        ELSE ''
 6234                                    END
 6235                                """
 6236                                )
 6237                                sql_query_annotation_to_agregate.append(
 6238                                    f""" string_agg(table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
 6239                                )
 6240
 6241                        # Not to annotate
 6242                        else:
 6243
 6244                            if force_update_annotation:
 6245                                annotation_message = "forced"
 6246                            else:
 6247                                annotation_message = "skipped"
 6248
 6249                            if annotation_field not in parquet_hdr_vcf_header_infos:
 6250                                log.warning(
 6251                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
 6252                                )
 6253                            if annotation_fields_new_name in self.get_header().infos:
 6254                                log.warning(
 6255                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
 6256                                )
 6257
 6258                    # Check if ALL fields have to be annotated. Thus concat all INFO field
 6259                    # allow_annotation_full_info = True
 6260                    allow_annotation_full_info = not force_append_annotation
 6261
 6262                    if parquet_type in ["regions"]:
 6263                        allow_annotation_full_info = False
 6264
 6265                    if (
 6266                        allow_annotation_full_info
 6267                        and nb_annotation_field == len(annotation_fields)
 6268                        and annotation_fields_all
 6269                        and (
 6270                            "INFO" in parquet_hdr_vcf_header_columns
 6271                            and "INFO" in database.get_extra_columns()
 6272                        )
 6273                    ):
 6274                        log.debug("Column INFO annotation enabled")
 6275                        sql_query_annotation_update_info_sets = []
 6276                        sql_query_annotation_update_info_sets.append(
 6277                            f" table_parquet.INFO "
 6278                        )
 6279
 6280                    if sql_query_annotation_update_info_sets:
 6281
 6282                        # Annotate
 6283                        log.info(f"Annotation '{annotation_name}' - Annotation...")
 6284
 6285                        # Join query annotation update info sets for SQL
 6286                        sql_query_annotation_update_info_sets_sql = ",".join(
 6287                            sql_query_annotation_update_info_sets
 6288                        )
 6289
 6290                        # Check chromosomes list (and variants infos)
 6291                        sql_query_chromosomes = f"""
 6292                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
 6293                            FROM {table_variants} as table_variants
 6294                            GROUP BY table_variants."#CHROM"
 6295                            ORDER BY table_variants."#CHROM"
 6296                            """
 6297                        sql_query_chromosomes_df = self.conn.execute(
 6298                            sql_query_chromosomes
 6299                        ).df()
 6300                        sql_query_chromosomes_dict = {
 6301                            entry["CHROM"]: {
 6302                                "count": entry["count_variants"],
 6303                                "min": entry["min_variants"],
 6304                                "max": entry["max_variants"],
 6305                            }
 6306                            for index, entry in sql_query_chromosomes_df.iterrows()
 6307                        }
 6308
 6309                        # Init
 6310                        nb_of_query = 0
 6311                        nb_of_variant_annotated = 0
 6312                        query_dict = query_dict_remove
 6313
 6314                        # for chrom in sql_query_chromosomes_df["CHROM"]:
 6315                        for chrom in sql_query_chromosomes_dict:
 6316
 6317                            # Number of variant by chromosome
 6318                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
 6319                                chrom, {}
 6320                            ).get("count", 0)
 6321
 6322                            log.debug(
 6323                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
 6324                            )
 6325
 6326                            # Annotation with regions database
 6327                            if parquet_type in ["regions"]:
 6328                                sql_query_annotation_from_clause = f"""
 6329                                    FROM (
 6330                                        SELECT 
 6331                                            '{chrom}' AS \"#CHROM\",
 6332                                            table_variants_from.\"POS\" AS \"POS\",
 6333                                            {",".join(sql_query_annotation_to_agregate)}
 6334                                        FROM {table_variants} as table_variants_from
 6335                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
 6336                                            table_parquet_from."#CHROM" = '{chrom}'
 6337                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
 6338                                            AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
 6339                                        )
 6340                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
 6341                                        GROUP BY table_variants_from.\"POS\"
 6342                                        )
 6343                                        as table_parquet
 6344                                """
 6345
 6346                                sql_query_annotation_where_clause = """
 6347                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
 6348                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 6349                                """
 6350
 6351                            # Annotation with variants database
 6352                            else:
 6353                                sql_query_annotation_from_clause = f"""
 6354                                    FROM {parquet_file_link} as table_parquet
 6355                                """
 6356                                sql_query_annotation_where_clause = f"""
 6357                                    table_variants."#CHROM" = '{chrom}'
 6358                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
 6359                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 6360                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 6361                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 6362                                """
 6363
 6364                            # Create update query
 6365                            sql_query_annotation_chrom_interval_pos = f"""
 6366                                UPDATE {table_variants} as table_variants
 6367                                    SET INFO = 
 6368                                        concat(
 6369                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 6370                                                THEN table_variants.INFO
 6371                                                ELSE ''
 6372                                            END
 6373                                            ,
 6374                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 6375                                                        AND (
 6376                                                        concat({sql_query_annotation_update_info_sets_sql})
 6377                                                        )
 6378                                                        NOT IN ('','.') 
 6379                                                    THEN ';'
 6380                                                    ELSE ''
 6381                                            END
 6382                                            ,
 6383                                            {sql_query_annotation_update_info_sets_sql}
 6384                                            )
 6385                                    {sql_query_annotation_from_clause}
 6386                                    WHERE {sql_query_annotation_where_clause}
 6387                                    ;
 6388                                """
 6389
 6390                            # Add update query to dict
 6391                            query_dict[
 6392                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
 6393                            ] = sql_query_annotation_chrom_interval_pos
 6394
 6395                        nb_of_query = len(query_dict)
 6396                        num_query = 0
 6397
 6398                        # SET max_expression_depth TO x
 6399                        self.conn.execute("SET max_expression_depth TO 10000")
 6400
 6401                        for query_name in query_dict:
 6402                            query = query_dict[query_name]
 6403                            num_query += 1
 6404                            log.info(
 6405                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
 6406                            )
 6407                            result = self.conn.execute(query)
 6408                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
 6409                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
 6410                            log.info(
 6411                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
 6412                            )
 6413
 6414                        log.info(
 6415                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
 6416                        )
 6417
 6418                    else:
 6419
 6420                        log.info(
 6421                            f"Annotation '{annotation_name}' - No Annotations available"
 6422                        )
 6423
 6424                    log.debug("Final header: " + str(vcf_reader.infos))
 6425
 6426        # Remove added columns
 6427        for added_column in added_columns:
 6428            self.drop_column(column=added_column)
 6429
 6430    def annotation_splice(self, threads: int = None) -> None:
 6431        """
 6432        This function annotate with snpEff
 6433
 6434        :param threads: The number of threads to use
 6435        :return: the value of the variable "return_value".
 6436        """
 6437
 6438        # DEBUG
 6439        log.debug("Start annotation with splice tools")
 6440
 6441        # Threads
 6442        if not threads:
 6443            threads = self.get_threads()
 6444        log.debug("Threads: " + str(threads))
 6445
 6446        # DEBUG
 6447        delete_tmp = True
 6448        if self.get_config().get("verbosity", "warning") in ["debug"]:
 6449            delete_tmp = False
 6450            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 6451
 6452        # Config
 6453        config = self.get_config()
 6454        log.debug("Config: " + str(config))
 6455        splice_config = config.get("tools", {}).get("splice", {})
 6456        if not splice_config:
 6457            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
 6458            msg_err = "No Splice tool config"
 6459            raise ValueError(msg_err)
 6460        log.debug(f"splice_config: {splice_config}")
 6461
 6462        # Config - Folders - Databases
 6463        databases_folders = (
 6464            config.get("folders", {}).get("databases", {}).get("splice", ["."])
 6465        )
 6466        log.debug("Databases annotations: " + str(databases_folders))
 6467
 6468        # Splice docker image
 6469        splice_docker_image = splice_config.get("docker").get("image")
 6470
 6471        # Pull splice image if it's not already there
 6472        if not check_docker_image_exists(splice_docker_image):
 6473            log.warning(
 6474                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
 6475            )
 6476            try:
 6477                command(f"docker pull {splice_config.get('docker').get('image')}")
 6478            except subprocess.CalledProcessError:
 6479                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
 6480                log.error(msg_err)
 6481                raise ValueError(msg_err)
 6482
 6483        # Config - splice databases
 6484        splice_databases = (
 6485            config.get("folders", {})
 6486            .get("databases", {})
 6487            .get("splice", DEFAULT_SPLICE_FOLDER)
 6488        )
 6489        splice_databases = full_path(splice_databases)
 6490
 6491        # Param
 6492        param = self.get_param()
 6493        log.debug("Param: " + str(param))
 6494
 6495        # Param
 6496        options = param.get("annotation", {}).get("splice", {}).get("options", {})
 6497        log.debug("Options: " + str(options))
 6498
 6499        # Data
 6500        table_variants = self.get_table_variants()
 6501
 6502        # Check if not empty
 6503        log.debug("Check if not empty")
 6504        sql_query_chromosomes = (
 6505            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 6506        )
 6507        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 6508            log.info("VCF empty")
 6509            return None
 6510
 6511        # Export in VCF
 6512        log.debug("Create initial file to annotate")
 6513
 6514        # Create output folder / work folder
 6515        if options.get("output_folder", ""):
 6516            output_folder = options.get("output_folder", "")
 6517            if not os.path.exists(output_folder):
 6518                Path(output_folder).mkdir(parents=True, exist_ok=True)
 6519        else:
 6520            output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
 6521            if not os.path.exists(output_folder):
 6522                Path(output_folder).mkdir(parents=True, exist_ok=True)
 6523
 6524        if options.get("workdir", ""):
 6525            workdir = options.get("workdir", "")
 6526        else:
 6527            workdir = "/work"
 6528
 6529        # Create tmp VCF file
 6530        tmp_vcf = NamedTemporaryFile(
 6531            prefix=self.get_prefix(),
 6532            dir=output_folder,
 6533            suffix=".vcf",
 6534            delete=False,
 6535        )
 6536        tmp_vcf_name = tmp_vcf.name
 6537
 6538        # VCF header
 6539        header = self.get_header()
 6540
 6541        # Existing annotations
 6542        for vcf_annotation in self.get_header().infos:
 6543
 6544            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 6545            log.debug(
 6546                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 6547            )
 6548
 6549        # Memory limit
 6550        if config.get("memory", None):
 6551            memory_limit = config.get("memory", "8G").upper()
 6552            # upper()
 6553        else:
 6554            memory_limit = "8G"
 6555        log.debug(f"memory_limit: {memory_limit}")
 6556
 6557        # Check number of variants to annotate
 6558        where_clause_regex_spliceai = r"SpliceAI_\w+"
 6559        where_clause_regex_spip = r"SPiP_\w+"
 6560        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
 6561        df_list_of_variants_to_annotate = self.get_query_to_df(
 6562            query=f""" SELECT * FROM variants {where_clause} """
 6563        )
 6564        if len(df_list_of_variants_to_annotate) == 0:
 6565            log.warning(
 6566                f"No variants to annotate with splice. Variants probably already annotated with splice"
 6567            )
 6568            return None
 6569        else:
 6570            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
 6571
 6572        # Export VCF file
 6573        self.export_variant_vcf(
 6574            vcf_file=tmp_vcf_name,
 6575            remove_info=True,
 6576            add_samples=True,
 6577            index=False,
 6578            where_clause=where_clause,
 6579        )
 6580        mount = [f" -v {path}:{path}:rw" for path in [output_folder]]
 6581        if any(value for value in splice_config.values() if value is None):
 6582            log.warning("At least one splice config parameter is empty")
 6583            # exit annotation_splice
 6584            return None
 6585
 6586        # Params in splice nf
 6587        def check_values(dico: dict):
 6588            """
 6589            Ensure parameters for NF splice pipeline
 6590            """
 6591            for key, val in dico.items():
 6592                if key == "genome":
 6593                    if any(
 6594                        assemb in options.get("genome", {})
 6595                        for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
 6596                    ):
 6597                        yield f"--{key} hg19"
 6598                    elif any(
 6599                        assemb in options.get("genome", {})
 6600                        for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
 6601                    ):
 6602                        yield f"--{key} hg38"
 6603                elif (
 6604                    (isinstance(val, str) and val)
 6605                    or isinstance(val, int)
 6606                    or isinstance(val, bool)
 6607                ):
 6608                    yield f"--{key} {val}"
 6609
 6610        # Genome
 6611        genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
 6612        options["genome"] = genome
 6613        # NF params
 6614        nf_params = []
 6615        # Add options
 6616        if options:
 6617            log.debug(options)
 6618            nf_params = list(check_values(options))
 6619            log.debug(f"Splice NF params: {' '.join(nf_params)}")
 6620        else:
 6621            log.debug("No NF params provided")
 6622        # Add threads
 6623        if "threads" not in options.keys():
 6624            nf_params.append(f"--threads {threads}")
 6625        # Genome path
 6626        genome_path = find_genome(
 6627            config.get("folders", {})
 6628            .get("databases", {})
 6629            .get("genomes", DEFAULT_GENOME_FOLDER),
 6630            file=f"{genome}.fa",
 6631        )
 6632        # Add genome path
 6633        if not genome_path:
 6634            raise ValueError(
 6635                f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
 6636            )
 6637        else:
 6638            log.debug(f"Genome: {genome_path}")
 6639            nf_params.append(f"--genome_path {genome_path}")
 6640
 6641        def splice_annotations(options: dict = {}, config: dict = {}) -> list:
 6642            """
 6643            Setting up updated databases for SPiP and SpliceAI
 6644            """
 6645
 6646            try:
 6647
 6648                # SpliceAI assembly transcriptome
 6649                spliceai_assembly = os.path.join(
 6650                    config.get("folders", {}).get("databases", {}).get("spliceai", {}),
 6651                    options.get("genome"),
 6652                    "transcriptome",
 6653                )
 6654                spip_assembly = options.get("genome")
 6655
 6656                spip = find(
 6657                    f"transcriptome_{spip_assembly}.RData",
 6658                    config.get("folders", {}).get("databases", {}).get("spip", {}),
 6659                )
 6660                spliceai = find("spliceai.refseq.txt", spliceai_assembly)
 6661                log.debug(f"SPiP annotations: {spip}")
 6662                log.debug(f"SpliceAI annotations: {spliceai}")
 6663                if spip and spliceai:
 6664                    return [
 6665                        f"--spip_transcriptome {spip}",
 6666                        f"--spliceai_transcriptome {spliceai}",
 6667                    ]
 6668                else:
 6669                    log.warning(
 6670                        "Can't find splice databases in configuration, use annotations file from image"
 6671                    )
 6672            except TypeError:
 6673                log.warning(
 6674                    "Can't find splice databases in configuration, use annotations file from image"
 6675                )
 6676                return []
 6677
 6678        # Add options, check if transcriptome option have already beend provided
 6679        if (
 6680            "spip_transcriptome" not in nf_params
 6681            and "spliceai_transcriptome" not in nf_params
 6682        ):
 6683            splice_reference = splice_annotations(options, config)
 6684            if splice_reference:
 6685                nf_params.extend(splice_reference)
 6686        # nf_params.append(f"--output_folder {output_folder}")
 6687        random_uuid = f"HOWARD-SPLICE-{get_random()}"
 6688        cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
 6689        log.debug(cmd)
 6690        splice_config["docker"]["command"] = cmd
 6691
 6692        # Ensure proxy is set
 6693        proxy = [
 6694            f"-e {var}={os.getenv(var)}"
 6695            for var in ["https_proxy", "http_proxy", "ftp_proxy"]
 6696            if os.getenv(var) is not None
 6697        ]
 6698        docker_cmd = get_bin_command(
 6699            tool="splice",
 6700            bin_type="docker",
 6701            config=config,
 6702            default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
 6703            add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}",
 6704        )
 6705        # print(docker_cmd)
 6706        # exit()
 6707        # Docker debug
 6708        # if splice_config.get("rm_container"):
 6709        #     rm_container = "--rm"
 6710        # else:
 6711        #     rm_container = ""
 6712        # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
 6713        log.debug(docker_cmd)
 6714        res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
 6715        log.debug(res.stdout)
 6716        if res.stderr:
 6717            log.error(res.stderr)
 6718        res.check_returncode()
 6719        # Update variants
 6720        log.info("Annotation - Updating...")
 6721        # Test find output vcf
 6722        log.debug(
 6723            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6724        )
 6725        output_vcf = []
 6726        # Wrong folder to look in
 6727        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
 6728            if (
 6729                files
 6730                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6731            ):
 6732                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
 6733        # log.debug(os.listdir(options.get("output_folder")))
 6734        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
 6735        if not output_vcf:
 6736            log.debug(
 6737                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
 6738            )
 6739        else:
 6740            # Get new header from annotated vcf
 6741            log.debug(f"Initial header: {len(header.infos)} fields")
 6742            # Create new header with splice infos
 6743            new_vcf = Variants(input=output_vcf[0])
 6744            new_vcf_header = new_vcf.get_header().infos
 6745            for keys, infos in new_vcf_header.items():
 6746                if keys not in header.infos.keys():
 6747                    header.infos[keys] = infos
 6748            log.debug(f"New header: {len(header.infos)} fields")
 6749            log.debug(f"Splice tmp output: {output_vcf[0]}")
 6750            self.update_from_vcf(output_vcf[0])
 6751
 6752        # Remove file
 6753        remove_if_exists(output_vcf)
 6754
 6755    ###
 6756    # Prioritization
 6757    ###
 6758
 6759    def get_config_default(self, name: str) -> dict:
 6760        """
 6761        The function `get_config_default` returns a dictionary containing default configurations for
 6762        various calculations and prioritizations.
 6763
 6764        :param name: The `get_config_default` function returns a dictionary containing default
 6765        configurations for different calculations and prioritizations. The `name` parameter is used to
 6766        specify which specific configuration to retrieve from the dictionary
 6767        :type name: str
 6768        :return: The function `get_config_default` returns a dictionary containing default configuration
 6769        settings for different calculations and prioritizations. The specific configuration settings are
 6770        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
 6771        matches a key in the `config_default` dictionary, the corresponding configuration settings are
 6772        returned. If there is no match, an empty dictionary is returned.
 6773        """
 6774
 6775        config_default = {
 6776            "calculations": {
 6777                "variant_chr_pos_alt_ref": {
 6778                    "type": "sql",
 6779                    "name": "variant_chr_pos_alt_ref",
 6780                    "description": "Create a variant ID with chromosome, position, alt and ref",
 6781                    "available": False,
 6782                    "output_column_name": "variant_chr_pos_alt_ref",
 6783                    "output_column_type": "String",
 6784                    "output_column_description": "variant ID with chromosome, position, alt and ref",
 6785                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
 6786                    "operation_info": True,
 6787                },
 6788                "VARTYPE": {
 6789                    "type": "sql",
 6790                    "name": "VARTYPE",
 6791                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
 6792                    "available": True,
 6793                    "table": "variants",
 6794                    "output_column_name": "VARTYPE",
 6795                    "output_column_type": "String",
 6796                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
 6797                    "operation_query": """
 6798                            CASE
 6799                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
 6800                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
 6801                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
 6802                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
 6803                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
 6804                                ELSE 'UNDEFINED'
 6805                            END
 6806                            """,
 6807                    "info_fields": ["SVTYPE"],
 6808                    "operation_info": True,
 6809                },
 6810                "snpeff_hgvs": {
 6811                    "type": "python",
 6812                    "name": "snpeff_hgvs",
 6813                    "description": "HGVS nomenclatures from snpEff annotation",
 6814                    "available": True,
 6815                    "function_name": "calculation_extract_snpeff_hgvs",
 6816                    "function_params": ["snpeff_hgvs", "ANN"],
 6817                },
 6818                "snpeff_ann_explode": {
 6819                    "type": "python",
 6820                    "name": "snpeff_ann_explode",
 6821                    "description": "Explode snpEff annotations with uniquify values",
 6822                    "available": True,
 6823                    "function_name": "calculation_snpeff_ann_explode",
 6824                    "function_params": [False, "fields", "snpeff_", "ANN"],
 6825                },
 6826                "snpeff_ann_explode_uniquify": {
 6827                    "type": "python",
 6828                    "name": "snpeff_ann_explode_uniquify",
 6829                    "description": "Explode snpEff annotations",
 6830                    "available": True,
 6831                    "function_name": "calculation_snpeff_ann_explode",
 6832                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
 6833                },
 6834                "snpeff_ann_explode_json": {
 6835                    "type": "python",
 6836                    "name": "snpeff_ann_explode_json",
 6837                    "description": "Explode snpEff annotations in JSON format",
 6838                    "available": True,
 6839                    "function_name": "calculation_snpeff_ann_explode",
 6840                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
 6841                },
 6842                "NOMEN": {
 6843                    "type": "python",
 6844                    "name": "NOMEN",
 6845                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)",
 6846                    "available": True,
 6847                    "function_name": "calculation_extract_nomen",
 6848                    "function_params": [],
 6849                },
 6850                "RENAME_INFO_FIELDS": {
 6851                    "type": "python",
 6852                    "name": "RENAME_INFO_FIELDS",
 6853                    "description": "Rename or remove INFO/tags",
 6854                    "available": True,
 6855                    "function_name": "calculation_rename_info_fields",
 6856                    "function_params": [],
 6857                },
 6858                "FINDBYPIPELINE": {
 6859                    "type": "python",
 6860                    "name": "FINDBYPIPELINE",
 6861                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
 6862                    "available": True,
 6863                    "function_name": "calculation_find_by_pipeline",
 6864                    "function_params": ["findbypipeline"],
 6865                },
 6866                "FINDBYSAMPLE": {
 6867                    "type": "python",
 6868                    "name": "FINDBYSAMPLE",
 6869                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
 6870                    "available": True,
 6871                    "function_name": "calculation_find_by_pipeline",
 6872                    "function_params": ["findbysample"],
 6873                },
 6874                "GENOTYPECONCORDANCE": {
 6875                    "type": "python",
 6876                    "name": "GENOTYPECONCORDANCE",
 6877                    "description": "Concordance of genotype for multi caller VCF",
 6878                    "available": True,
 6879                    "function_name": "calculation_genotype_concordance",
 6880                    "function_params": [],
 6881                },
 6882                "BARCODE": {
 6883                    "type": "python",
 6884                    "name": "BARCODE",
 6885                    "description": "BARCODE as VaRank tool",
 6886                    "available": True,
 6887                    "function_name": "calculation_barcode",
 6888                    "function_params": [],
 6889                },
 6890                "BARCODEFAMILY": {
 6891                    "type": "python",
 6892                    "name": "BARCODEFAMILY",
 6893                    "description": "BARCODEFAMILY as VaRank tool",
 6894                    "available": True,
 6895                    "function_name": "calculation_barcode_family",
 6896                    "function_params": ["BCF"],
 6897                },
 6898                "TRIO": {
 6899                    "type": "python",
 6900                    "name": "TRIO",
 6901                    "description": "Inheritance for a trio family",
 6902                    "available": True,
 6903                    "function_name": "calculation_trio",
 6904                    "function_params": [],
 6905                },
 6906                "VAF": {
 6907                    "type": "python",
 6908                    "name": "VAF",
 6909                    "description": "Variant Allele Frequency (VAF) harmonization",
 6910                    "available": True,
 6911                    "function_name": "calculation_vaf_normalization",
 6912                    "function_params": [],
 6913                },
 6914                "VAF_stats": {
 6915                    "type": "python",
 6916                    "name": "VAF_stats",
 6917                    "description": "Variant Allele Frequency (VAF) statistics",
 6918                    "available": True,
 6919                    "function_name": "calculation_genotype_stats",
 6920                    "function_params": ["VAF"],
 6921                },
 6922                "DP_stats": {
 6923                    "type": "python",
 6924                    "name": "DP_stats",
 6925                    "description": "Depth (DP) statistics",
 6926                    "available": True,
 6927                    "function_name": "calculation_genotype_stats",
 6928                    "function_params": ["DP"],
 6929                },
 6930                "variant_id": {
 6931                    "type": "python",
 6932                    "name": "variant_id",
 6933                    "description": "Variant ID generated from variant position and type",
 6934                    "available": True,
 6935                    "function_name": "calculation_variant_id",
 6936                    "function_params": [],
 6937                },
 6938                "transcripts_json": {
 6939                    "type": "python",
 6940                    "name": "transcripts_json",
 6941                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
 6942                    "available": True,
 6943                    "function_name": "calculation_transcripts_annotation",
 6944                    "function_params": ["transcripts_json", None],
 6945                },
 6946                "transcripts_ann": {
 6947                    "type": "python",
 6948                    "name": "transcripts_ann",
 6949                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
 6950                    "available": True,
 6951                    "function_name": "calculation_transcripts_annotation",
 6952                    "function_params": [None, "transcripts_ann"],
 6953                },
 6954                "transcripts_annotations": {
 6955                    "type": "python",
 6956                    "name": "transcripts_annotations",
 6957                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
 6958                    "available": True,
 6959                    "function_name": "calculation_transcripts_annotation",
 6960                    "function_params": [None, None],
 6961                },
 6962                "transcripts_prioritization": {
 6963                    "type": "python",
 6964                    "name": "transcripts_prioritization",
 6965                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
 6966                    "available": True,
 6967                    "function_name": "calculation_transcripts_prioritization",
 6968                    "function_params": [],
 6969                },
 6970                "transcripts_export": {
 6971                    "type": "python",
 6972                    "name": "transcripts_export",
 6973                    "description": "Export transcripts table/view as a file (using param.json)",
 6974                    "available": True,
 6975                    "function_name": "calculation_transcripts_export",
 6976                    "function_params": [],
 6977                },
 6978            },
 6979            "prioritizations": {
 6980                "default": {
 6981                    "ANN2": [
 6982                        {
 6983                            "type": "contains",
 6984                            "value": "HIGH",
 6985                            "score": 5,
 6986                            "flag": "PASS",
 6987                            "comment": [
 6988                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
 6989                            ],
 6990                        },
 6991                        {
 6992                            "type": "contains",
 6993                            "value": "MODERATE",
 6994                            "score": 3,
 6995                            "flag": "PASS",
 6996                            "comment": [
 6997                                "A non-disruptive variant that might change protein effectiveness"
 6998                            ],
 6999                        },
 7000                        {
 7001                            "type": "contains",
 7002                            "value": "LOW",
 7003                            "score": 0,
 7004                            "flag": "FILTERED",
 7005                            "comment": [
 7006                                "Assumed to be mostly harmless or unlikely to change protein behavior"
 7007                            ],
 7008                        },
 7009                        {
 7010                            "type": "contains",
 7011                            "value": "MODIFIER",
 7012                            "score": 0,
 7013                            "flag": "FILTERED",
 7014                            "comment": [
 7015                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
 7016                            ],
 7017                        },
 7018                    ],
 7019                }
 7020            },
 7021        }
 7022
 7023        return config_default.get(name, None)
 7024
 7025    def get_config_json(
 7026        self, name: str, config_dict: dict = {}, config_file: str = None
 7027    ) -> dict:
 7028        """
 7029        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
 7030        default values, a dictionary, and a file.
 7031
 7032        :param name: The `name` parameter in the `get_config_json` function is a string that represents
 7033        the name of the configuration. It is used to identify and retrieve the configuration settings
 7034        for a specific component or module
 7035        :type name: str
 7036        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
 7037        dictionary that allows you to provide additional configuration settings or overrides. When you
 7038        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
 7039        the key is the configuration setting you want to override or
 7040        :type config_dict: dict
 7041        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
 7042        specify the path to a configuration file that contains additional settings. If provided, the
 7043        function will read the contents of this file and update the configuration dictionary with the
 7044        values found in the file, overriding any existing values with the
 7045        :type config_file: str
 7046        :return: The function `get_config_json` returns a dictionary containing the configuration
 7047        settings.
 7048        """
 7049
 7050        # Create with default prioritizations
 7051        config_default = self.get_config_default(name=name)
 7052        configuration = config_default
 7053        # log.debug(f"configuration={configuration}")
 7054
 7055        # Replace prioritizations from dict
 7056        for config in config_dict:
 7057            configuration[config] = config_dict[config]
 7058
 7059        # Replace prioritizations from file
 7060        config_file = full_path(config_file)
 7061        if config_file:
 7062            if os.path.exists(config_file):
 7063                with open(config_file) as config_file_content:
 7064                    config_file_dict = yaml.safe_load(config_file_content)
 7065                for config in config_file_dict:
 7066                    configuration[config] = config_file_dict[config]
 7067            else:
 7068                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
 7069                log.error(msg_error)
 7070                raise ValueError(msg_error)
 7071
 7072        return configuration
 7073
 7074    def prioritization(
 7075        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
 7076    ) -> bool:
 7077        """
 7078        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
 7079        prioritizes variants based on configured profiles and criteria.
 7080
 7081        :param table: The `table` parameter in the `prioritization` function is used to specify the name
 7082        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
 7083        a table name is provided, the method will prioritize the variants in that specific table
 7084        :type table: str
 7085        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
 7086        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
 7087        provided, the code will use a default prefix value of "PZ"
 7088        :type pz_prefix: str
 7089        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
 7090        additional parameters specific to the prioritization process. These parameters can include
 7091        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
 7092        configurations needed for the prioritization of variants in a V
 7093        :type pz_param: dict
 7094        :return: A boolean value (True) is being returned from the `prioritization` function.
 7095        """
 7096
 7097        # Config
 7098        config = self.get_config()
 7099
 7100        # Param
 7101        param = self.get_param()
 7102
 7103        # Prioritization param
 7104        if pz_param is not None:
 7105            prioritization_param = pz_param
 7106        else:
 7107            prioritization_param = param.get("prioritization", {})
 7108
 7109        # Configuration profiles
 7110        prioritization_config_file = prioritization_param.get(
 7111            "prioritization_config", None
 7112        )
 7113        prioritization_config_file = full_path(prioritization_config_file)
 7114        prioritizations_config = self.get_config_json(
 7115            name="prioritizations", config_file=prioritization_config_file
 7116        )
 7117
 7118        # Prioritization prefix
 7119        pz_prefix_default = "PZ"
 7120        if pz_prefix is None:
 7121            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
 7122
 7123        # Prioritization options
 7124        profiles = prioritization_param.get("profiles", [])
 7125        if isinstance(profiles, str):
 7126            profiles = profiles.split(",")
 7127        pzfields = prioritization_param.get(
 7128            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
 7129        )
 7130        if isinstance(pzfields, str):
 7131            pzfields = pzfields.split(",")
 7132        default_profile = prioritization_param.get("default_profile", None)
 7133        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
 7134        prioritization_score_mode = prioritization_param.get(
 7135            "prioritization_score_mode", "HOWARD"
 7136        )
 7137
 7138        # Quick Prioritizations
 7139        prioritizations = param.get("prioritizations", None)
 7140        if prioritizations:
 7141            log.info("Quick Prioritization:")
 7142            for profile in prioritizations.split(","):
 7143                if profile not in profiles:
 7144                    profiles.append(profile)
 7145                    log.info(f"   {profile}")
 7146
 7147        # If profile "ALL" provided, all profiles in the config profiles
 7148        if "ALL" in profiles:
 7149            profiles = list(prioritizations_config.keys())
 7150
 7151        for profile in profiles:
 7152            if prioritizations_config.get(profile, None):
 7153                log.debug(f"Profile '{profile}' configured")
 7154            else:
 7155                msg_error = f"Profile '{profile}' NOT configured"
 7156                log.error(msg_error)
 7157                raise ValueError(msg_error)
 7158
 7159        if profiles:
 7160            log.info(f"Prioritization... ")
 7161        else:
 7162            log.debug(f"No profile defined")
 7163            return False
 7164
 7165        if not default_profile and len(profiles):
 7166            default_profile = profiles[0]
 7167
 7168        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
 7169        log.debug("Profiles to check: " + str(list(profiles)))
 7170
 7171        # Variables
 7172        if table is not None:
 7173            table_variants = table
 7174        else:
 7175            table_variants = self.get_table_variants(clause="update")
 7176        log.debug(f"Table to prioritize: {table_variants}")
 7177
 7178        # Added columns
 7179        added_columns = []
 7180
 7181        # Create list of PZfields
 7182        # List of PZFields
 7183        list_of_pzfields_original = pzfields + [
 7184            pzfield + pzfields_sep + profile
 7185            for pzfield in pzfields
 7186            for profile in profiles
 7187        ]
 7188        list_of_pzfields = []
 7189        log.debug(f"{list_of_pzfields_original}")
 7190
 7191        # Remove existing PZfields to use if exists
 7192        for pzfield in list_of_pzfields_original:
 7193            if self.get_header().infos.get(pzfield, None) is None:
 7194                list_of_pzfields.append(pzfield)
 7195                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
 7196            else:
 7197                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
 7198
 7199        if list_of_pzfields:
 7200
 7201            # Explode Infos prefix
 7202            explode_infos_prefix = self.get_explode_infos_prefix()
 7203
 7204            # PZfields tags description
 7205            PZfields_INFOS = {
 7206                f"{pz_prefix}Tags": {
 7207                    "ID": f"{pz_prefix}Tags",
 7208                    "Number": ".",
 7209                    "Type": "String",
 7210                    "Description": "Variant tags based on annotation criteria",
 7211                },
 7212                f"{pz_prefix}Score": {
 7213                    "ID": f"{pz_prefix}Score",
 7214                    "Number": 1,
 7215                    "Type": "Integer",
 7216                    "Description": "Variant score based on annotation criteria",
 7217                },
 7218                f"{pz_prefix}Flag": {
 7219                    "ID": f"{pz_prefix}Flag",
 7220                    "Number": 1,
 7221                    "Type": "String",
 7222                    "Description": "Variant flag based on annotation criteria",
 7223                },
 7224                f"{pz_prefix}Comment": {
 7225                    "ID": f"{pz_prefix}Comment",
 7226                    "Number": ".",
 7227                    "Type": "String",
 7228                    "Description": "Variant comment based on annotation criteria",
 7229                },
 7230                f"{pz_prefix}Infos": {
 7231                    "ID": f"{pz_prefix}Infos",
 7232                    "Number": ".",
 7233                    "Type": "String",
 7234                    "Description": "Variant infos based on annotation criteria",
 7235                },
 7236                f"{pz_prefix}Class": {
 7237                    "ID": f"{pz_prefix}Class",
 7238                    "Number": ".",
 7239                    "Type": "String",
 7240                    "Description": "Variant class based on annotation criteria",
 7241                },
 7242            }
 7243
 7244            # Create INFO fields if not exist
 7245            for field in PZfields_INFOS:
 7246                field_ID = PZfields_INFOS[field]["ID"]
 7247                field_description = PZfields_INFOS[field]["Description"]
 7248                if field_ID not in self.get_header().infos and field_ID in pzfields:
 7249                    field_description = (
 7250                        PZfields_INFOS[field]["Description"]
 7251                        + f", profile {default_profile}"
 7252                    )
 7253                    self.get_header().infos[field_ID] = vcf.parser._Info(
 7254                        field_ID,
 7255                        PZfields_INFOS[field]["Number"],
 7256                        PZfields_INFOS[field]["Type"],
 7257                        field_description,
 7258                        "unknown",
 7259                        "unknown",
 7260                        code_type_map[PZfields_INFOS[field]["Type"]],
 7261                    )
 7262
 7263            # Create INFO fields if not exist for each profile
 7264            for profile in prioritizations_config:
 7265                if profile in profiles or profiles == []:
 7266                    for field in PZfields_INFOS:
 7267                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
 7268                        field_description = (
 7269                            PZfields_INFOS[field]["Description"]
 7270                            + f", profile {profile}"
 7271                        )
 7272                        if (
 7273                            field_ID not in self.get_header().infos
 7274                            and field in pzfields
 7275                        ):
 7276                            self.get_header().infos[field_ID] = vcf.parser._Info(
 7277                                field_ID,
 7278                                PZfields_INFOS[field]["Number"],
 7279                                PZfields_INFOS[field]["Type"],
 7280                                field_description,
 7281                                "unknown",
 7282                                "unknown",
 7283                                code_type_map[PZfields_INFOS[field]["Type"]],
 7284                            )
 7285
 7286            # Header
 7287            for pzfield in list_of_pzfields:
 7288                if re.match(f"{pz_prefix}Score.*", pzfield):
 7289                    added_column = self.add_column(
 7290                        table_name=table_variants,
 7291                        column_name=pzfield,
 7292                        column_type="INTEGER",
 7293                        default_value="0",
 7294                    )
 7295                elif re.match(f"{pz_prefix}Flag.*", pzfield):
 7296                    added_column = self.add_column(
 7297                        table_name=table_variants,
 7298                        column_name=pzfield,
 7299                        column_type="BOOLEAN",
 7300                        default_value="1",
 7301                    )
 7302                elif re.match(f"{pz_prefix}Class.*", pzfield):
 7303                    added_column = self.add_column(
 7304                        table_name=table_variants,
 7305                        column_name=pzfield,
 7306                        column_type="VARCHAR[]",
 7307                        default_value="null",
 7308                    )
 7309                else:
 7310                    added_column = self.add_column(
 7311                        table_name=table_variants,
 7312                        column_name=pzfield,
 7313                        column_type="STRING",
 7314                        default_value="''",
 7315                    )
 7316                added_columns.append(added_column)
 7317
 7318            # Profiles
 7319            if profiles:
 7320
 7321                # foreach profile in configuration file
 7322                for profile in prioritizations_config:
 7323
 7324                    # If profile is asked in param, or ALL are asked (empty profile [])
 7325                    if profile in profiles or profiles == []:
 7326                        log.info(f"Profile '{profile}'")
 7327
 7328                        sql_set_info_option = ""
 7329
 7330                        sql_set_info = []
 7331
 7332                        # PZ fields set
 7333
 7334                        # PZScore
 7335                        if (
 7336                            f"{pz_prefix}Score{pzfields_sep}{profile}"
 7337                            in list_of_pzfields
 7338                        ):
 7339                            sql_set_info.append(
 7340                                f"""
 7341                                    concat(
 7342                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
 7343                                        {pz_prefix}Score{pzfields_sep}{profile}
 7344                                    ) 
 7345                                """
 7346                            )
 7347                            if (
 7348                                profile == default_profile
 7349                                and f"{pz_prefix}Score" in list_of_pzfields
 7350                            ):
 7351                                sql_set_info.append(
 7352                                    f"""
 7353                                        concat(
 7354                                            '{pz_prefix}Score=',
 7355                                            {pz_prefix}Score{pzfields_sep}{profile}
 7356                                        )
 7357                                    """
 7358                                )
 7359
 7360                        # PZFlag
 7361                        if (
 7362                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7363                            in list_of_pzfields
 7364                        ):
 7365                            sql_set_info.append(
 7366                                f"""
 7367                                    concat(
 7368                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
 7369                                        CASE 
 7370                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7371                                            THEN 'PASS'
 7372                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7373                                            THEN 'FILTERED'
 7374                                        END
 7375                                    ) 
 7376                                """
 7377                            )
 7378                            if (
 7379                                profile == default_profile
 7380                                and f"{pz_prefix}Flag" in list_of_pzfields
 7381                            ):
 7382                                sql_set_info.append(
 7383                                    f"""
 7384                                        concat(
 7385                                            '{pz_prefix}Flag=',
 7386                                            CASE 
 7387                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7388                                                THEN 'PASS'
 7389                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7390                                                THEN 'FILTERED'
 7391                                            END
 7392                                        )
 7393                                    """
 7394                                )
 7395
 7396                        # PZClass
 7397                        if (
 7398                            f"{pz_prefix}Class{pzfields_sep}{profile}"
 7399                            in list_of_pzfields
 7400                        ):
 7401                            sql_set_info.append(
 7402                                f"""
 7403                                    concat(
 7404                                        '{pz_prefix}Class{pzfields_sep}{profile}=',
 7405                                        CASE
 7406                                            WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7407                                            THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7408                                            ELSE '.'
 7409                                        END 
 7410                                    )
 7411                                    
 7412                                """
 7413                            )
 7414                            if (
 7415                                profile == default_profile
 7416                                and f"{pz_prefix}Class" in list_of_pzfields
 7417                            ):
 7418                                sql_set_info.append(
 7419                                    f"""
 7420                                        concat(
 7421                                            '{pz_prefix}Class=',
 7422                                            CASE
 7423                                                WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7424                                                THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7425                                                ELSE '.'
 7426                                            END 
 7427                                        )
 7428                                    """
 7429                                )
 7430
 7431                        # PZComment
 7432                        if (
 7433                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7434                            in list_of_pzfields
 7435                        ):
 7436                            sql_set_info.append(
 7437                                f"""
 7438                                    CASE
 7439                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7440                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
 7441                                        ELSE ''
 7442                                    END
 7443                                """
 7444                            )
 7445                            if (
 7446                                profile == default_profile
 7447                                and f"{pz_prefix}Comment" in list_of_pzfields
 7448                            ):
 7449                                sql_set_info.append(
 7450                                    f"""
 7451                                        CASE
 7452                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7453                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
 7454                                            ELSE ''
 7455                                        END
 7456                                    """
 7457                                )
 7458
 7459                        # PZInfos
 7460                        if (
 7461                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7462                            in list_of_pzfields
 7463                        ):
 7464                            sql_set_info.append(
 7465                                f"""
 7466                                    CASE
 7467                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7468                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
 7469                                        ELSE ''
 7470                                    END
 7471                                """
 7472                            )
 7473                            if (
 7474                                profile == default_profile
 7475                                and f"{pz_prefix}Infos" in list_of_pzfields
 7476                            ):
 7477                                sql_set_info.append(
 7478                                    f"""
 7479                                        CASE
 7480                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7481                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
 7482                                            ELSE ''
 7483                                        END
 7484                                    """
 7485                                )
 7486
 7487                        # Merge PZfields
 7488                        sql_set_info_option = ""
 7489                        sql_set_sep = ""
 7490                        for sql_set in sql_set_info:
 7491                            if sql_set_sep:
 7492                                sql_set_info_option += f"""
 7493                                    , concat('{sql_set_sep}', {sql_set})
 7494                                """
 7495                            else:
 7496                                sql_set_info_option += f"""
 7497                                    , {sql_set}
 7498                                """
 7499                            sql_set_sep = ";"
 7500
 7501                        sql_queries = []
 7502                        criterion_fields_profile = []
 7503                        annotation_view_name = (
 7504                            "annotation_view_for_prioritization_"
 7505                            + str(random.randrange(1000))
 7506                        )
 7507                        annotation_view_prefix = ""
 7508                        for annotation in prioritizations_config[profile]:
 7509
 7510                            # skip special sections
 7511                            if annotation.startswith("_"):
 7512                                continue
 7513
 7514                            # For each criterions
 7515                            for criterion in prioritizations_config[profile][
 7516                                annotation
 7517                            ]:
 7518
 7519                                # Criterion mode
 7520                                criterion_mode = None
 7521                                if np.any(
 7522                                    np.isin(list(criterion.keys()), ["type", "value"])
 7523                                ):
 7524                                    criterion_mode = "operation"
 7525                                elif np.any(
 7526                                    np.isin(list(criterion.keys()), ["sql", "fields"])
 7527                                ):
 7528                                    criterion_mode = "sql"
 7529                                log.debug(f"Criterion Mode: {criterion_mode}")
 7530
 7531                                # Criterion parameters
 7532                                criterion_type = criterion.get("type", None)
 7533                                criterion_value = criterion.get("value", None)
 7534                                criterion_sql = criterion.get("sql", None)
 7535                                criterion_fields = criterion.get("fields", None)
 7536                                criterion_score = criterion.get("score", 0)
 7537                                criterion_flag = criterion.get("flag", "PASS")
 7538                                criterion_class = criterion.get("class", None)
 7539                                criterion_flag_bool = criterion_flag == "PASS"
 7540                                criterion_comment = (
 7541                                    ", ".join(criterion.get("comment", []))
 7542                                    .replace("'", "''")
 7543                                    .replace(";", ",")
 7544                                    .replace("\t", " ")
 7545                                )
 7546                                criterion_infos = (
 7547                                    str(criterion)
 7548                                    .replace("'", "''")
 7549                                    .replace(";", ",")
 7550                                    .replace("\t", " ")
 7551                                )
 7552
 7553                                # SQL
 7554                                if criterion_sql is not None and isinstance(
 7555                                    criterion_sql, list
 7556                                ):
 7557                                    criterion_sql = " ".join(criterion_sql)
 7558
 7559                                # Fields and explode
 7560                                if criterion_fields is None:
 7561                                    criterion_fields = [annotation]
 7562                                if not isinstance(criterion_fields, list):
 7563                                    criterion_fields = str(criterion_fields).split(",")
 7564
 7565                                # Class
 7566                                if criterion_class is not None and not isinstance(
 7567                                    criterion_class, list
 7568                                ):
 7569                                    criterion_class = str(criterion_class).split(",")
 7570
 7571                                # Add criterion fields to the list of profile's criteria
 7572                                criterion_fields_profile = list(
 7573                                    set(criterion_fields_profile + criterion_fields)
 7574                                )
 7575
 7576                                sql_set = []
 7577                                sql_set_info = []
 7578
 7579                                # PZ fields set
 7580
 7581                                # PZScore
 7582                                if (
 7583                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
 7584                                    in list_of_pzfields
 7585                                ):
 7586                                    # VaRank prioritization score mode
 7587                                    if prioritization_score_mode.upper().strip() in [
 7588                                        "VARANK",
 7589                                        "MAX",
 7590                                        "MAXIMUM",
 7591                                        "TOP",
 7592                                    ]:
 7593                                        sql_set.append(
 7594                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} ELSE {pz_prefix}Score{pzfields_sep}{profile} END "
 7595                                        )
 7596                                    # default HOWARD prioritization score mode
 7597                                    else:
 7598                                        sql_set.append(
 7599                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7600                                        )
 7601
 7602                                # PZFlag
 7603                                if (
 7604                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7605                                    in list_of_pzfields
 7606                                ):
 7607                                    sql_set.append(
 7608                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
 7609                                    )
 7610
 7611                                # PZClass
 7612                                if (
 7613                                    f"{pz_prefix}Class{pzfields_sep}{profile}"
 7614                                    in list_of_pzfields
 7615                                    and criterion_class is not None
 7616                                ):
 7617                                    sql_set.append(
 7618                                        f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
 7619                                    )
 7620
 7621                                # PZComment
 7622                                if (
 7623                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7624                                    in list_of_pzfields
 7625                                ):
 7626                                    sql_set.append(
 7627                                        f"""
 7628                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
 7629                                                concat(
 7630                                                    {pz_prefix}Comment{pzfields_sep}{profile},
 7631                                                    CASE 
 7632                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
 7633                                                        THEN ', '
 7634                                                        ELSE ''
 7635                                                    END,
 7636                                                    '{criterion_comment}'
 7637                                                )
 7638                                        """
 7639                                    )
 7640
 7641                                # PZInfos
 7642                                if (
 7643                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7644                                    in list_of_pzfields
 7645                                ):
 7646                                    sql_set.append(
 7647                                        f"""
 7648                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
 7649                                                concat(
 7650                                                    {pz_prefix}Infos{pzfields_sep}{profile},
 7651                                                    '{criterion_infos}'
 7652                                                )
 7653                                        """
 7654                                    )
 7655                                sql_set_option = ",".join(sql_set)
 7656
 7657                                # Criterion and comparison
 7658                                if sql_set_option:
 7659
 7660                                    # Operation mode
 7661                                    if criterion_mode in ["operation"]:
 7662
 7663                                        # Check if value is a float
 7664                                        try:
 7665                                            float(criterion_value)
 7666                                            sql_update = f"""
 7667                                                UPDATE "{table_variants}"
 7668                                                SET {sql_set_option}
 7669                                                FROM (
 7670                                                    SELECT *
 7671                                                    FROM "{annotation_view_name}"
 7672                                                    WHERE (
 7673                                                        CAST("{annotation_view_name}"."{annotation_view_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
 7674                                                        AND   CAST("{annotation_view_name}"."{annotation_view_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
 7675                                                        )
 7676                                                    ) AS "{annotation_view_name}"
 7677                                                WHERE "{table_variants}"."#CHROM" == "{annotation_view_name}"."#CHROM"
 7678                                                  AND "{table_variants}"."POS" == "{annotation_view_name}"."POS"
 7679                                                  AND "{table_variants}"."REF" == "{annotation_view_name}"."REF"
 7680                                                  AND "{table_variants}"."ALT" == "{annotation_view_name}"."ALT" 
 7681                                                
 7682                                            """
                                        # If not a float
 7684                                        except:
 7685                                            contains_option = ""
 7686                                            if criterion_type == "contains":
 7687                                                contains_option = ".*"
 7688                                            sql_update = f"""
 7689                                                UPDATE "{table_variants}"
 7690                                                SET {sql_set_option}
 7691                                                FROM (
 7692                                                    SELECT *
 7693                                                    FROM "{annotation_view_name}"
 7694                                                    WHERE (
 7695                                                        "{annotation_view_name}"."{annotation_view_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
 7696                                                        )
 7697                                                    ) AS "{annotation_view_name}"
 7698                                                WHERE "{table_variants}"."#CHROM" == "{annotation_view_name}"."#CHROM"
 7699                                                  AND "{table_variants}"."POS" == "{annotation_view_name}"."POS"
 7700                                                  AND "{table_variants}"."REF" == "{annotation_view_name}"."REF"
 7701                                                  AND "{table_variants}"."ALT" == "{annotation_view_name}"."ALT" 
 7702                                                  
 7703                                            """
 7704                                        sql_queries.append(sql_update)
 7705
 7706                                    # SQL mode
 7707                                    elif criterion_mode in ["sql"]:
 7708
 7709                                        sql_update = f"""
 7710                                            UPDATE {table_variants}
 7711                                            SET {sql_set_option}
 7712                                            FROM (
 7713                                                SELECT *
 7714                                                FROM "{annotation_view_name}"
 7715                                                WHERE ({criterion_sql})
 7716                                                ) AS "{annotation_view_name}"
 7717                                            WHERE "{table_variants}"."#CHROM" == "{annotation_view_name}"."#CHROM"
 7718                                                AND "{table_variants}"."POS" == "{annotation_view_name}"."POS"
 7719                                                AND "{table_variants}"."REF" == "{annotation_view_name}"."REF"
 7720                                                AND "{table_variants}"."ALT" == "{annotation_view_name}"."ALT" 
 7721                                        """
 7722                                        sql_queries.append(sql_update)
 7723
 7724                                    else:
 7725                                        msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
 7726                                        log.error(msg_err)
 7727                                        raise ValueError(msg_err)
 7728
 7729                                else:
 7730                                    log.warning(
 7731                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
 7732                                    )
 7733
 7734                        # PZTags
 7735                        if (
 7736                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
 7737                            in list_of_pzfields
 7738                        ):
 7739
                            # Create PZTags value
 7741                            pztags_value = ""
 7742                            pztags_sep_default = ","
 7743                            pztags_sep = ""
 7744                            for pzfield in pzfields:
 7745                                if pzfield not in [f"{pz_prefix}Tags"]:
 7746                                    if (
 7747                                        f"{pzfield}{pzfields_sep}{profile}"
 7748                                        in list_of_pzfields
 7749                                    ):
 7750                                        if pzfield in [f"{pz_prefix}Flag"]:
 7751                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7752                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
 7753                                                    THEN 'PASS'
 7754                                                    ELSE 'FILTERED'
 7755                                                END, '"""
 7756                                        elif pzfield in [f"{pz_prefix}Class"]:
 7757                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7758                                                CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7759                                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7760                                                    ELSE '.'
 7761                                                END, '"""
 7762                                        else:
 7763                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
 7764                                        pztags_sep = pztags_sep_default
 7765
                            # Add Query update for PZTags
 7767                            sql_update_pztags = f"""
 7768                                UPDATE {table_variants}
 7769                                SET INFO = concat(
 7770                                        INFO,
 7771                                        CASE WHEN INFO NOT in ('','.')
 7772                                                THEN ';'
 7773                                                ELSE ''
 7774                                        END,
 7775                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
 7776                                    )
 7777                                WHERE 1=1
 7778                                """
 7779                            sql_queries.append(sql_update_pztags)
 7780
                            # Add Query update for PZTags for the default profile
 7782                            if profile == default_profile:
 7783                                sql_update_pztags_default = f"""
 7784                                UPDATE {table_variants}
 7785                                SET INFO = concat(
 7786                                        INFO,
 7787                                        ';',
 7788                                        '{pz_prefix}Tags={pztags_value}'
 7789                                    )
 7790                                    WHERE 1=1
 7791                                """
 7792                                sql_queries.append(sql_update_pztags_default)
 7793
 7794                        log.info(f"""Profile '{profile}' - Prioritization... """)
 7795
 7796                        # Create annotations view for prioritization
 7797                        log.debug(
 7798                            f"""Profile '{profile}' - Prioritization - Create '{annotation_view_name}' view with '{criterion_fields_profile}'... """
 7799                        )
 7800                        annotation_view = self.create_annotations_view(
 7801                            view=annotation_view_name,
 7802                            prefix=annotation_view_prefix,
 7803                            fields=criterion_fields_profile,
 7804                            drop_view=True,
 7805                        )
 7806
 7807                        # Chromosomes list
 7808                        sql_uniq_chrom = f"""
 7809                            SELECT DISTINCT "#CHROM"
 7810                            FROM {table_variants}
 7811                        """
 7812                        chroms = self.get_query_to_df(sql_uniq_chrom)["#CHROM"].tolist()
 7813
 7814                        for chrom in chroms:
 7815
 7816                            log.debug(
 7817                                f"""Profile '{profile}' - Prioritization query - Chromosome '{chrom}'... """
 7818                            )
 7819
 7820                            if sql_queries:
 7821
 7822                                # Query num
 7823                                num_query = 0
 7824
 7825                                # For each query
 7826                                for sql_query in sql_queries:
 7827
 7828                                    # Query num
 7829                                    num_query += 1
 7830
 7831                                    sql_query_chrom = f"""
 7832                                        {sql_query}
 7833                                        AND {table_variants}."#CHROM" LIKE '{chrom}' 
 7834                                    """
 7835                                    log.debug(
 7836                                        f"""Profile '{profile}' - Prioritization query - Chromosome '{chrom}' [{num_query}/{len(sql_queries)}]"""
 7837                                    )
 7838                                    # log.debug(f"""sql_query_chrom: {sql_query_chrom}""")
 7839                                    self.execute_query(query=sql_query_chrom)
 7840
 7841                        # Update INFO field
 7842                        log.info(f"""Profile '{profile}' - Update... """)
 7843                        sql_query_update = f"""
 7844                            UPDATE {table_variants}
 7845                            SET INFO =  
 7846                                concat(
 7847                                    CASE
 7848                                        WHEN INFO NOT IN ('','.')
 7849                                        THEN concat(INFO, ';')
 7850                                        ELSE ''
 7851                                    END
 7852                                    {sql_set_info_option}
 7853                                )
 7854                        """
 7855                        # log.debug(f"sql_query_update={sql_query_update}")
 7856                        self.execute_query(query=sql_query_update)
 7857
 7858                        # Remove annotations view for prioritization
 7859                        query_drop_tmp_table = f"""
 7860                            DROP VIEW IF EXISTS {annotation_view_name}
 7861                        """
 7862                        self.execute_query(query=query_drop_tmp_table)
 7863
 7864        else:
 7865
 7866            log.warning(f"No profiles in parameters")
 7867
 7868        # Remove added columns
 7869        for added_column in added_columns:
 7870            self.drop_column(column=added_column)
 7871
 7872        # Explode INFOS fields into table fields
 7873        if self.get_explode_infos():
 7874            self.explode_infos(
 7875                prefix=self.get_explode_infos_prefix(),
 7876                fields=self.get_explode_infos_fields(),
 7877                force=True,
 7878            )
 7879
 7880        return True
 7881
 7882    ###
 7883    # HGVS
 7884    ###
 7885
 7886    def annotation_hgvs(self, threads: int = None) -> None:
 7887        """
 7888        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
 7889        coordinates and alleles.
 7890
 7891        :param threads: The `threads` parameter is an optional integer that specifies the number of
 7892        threads to use for parallel processing. If no value is provided, it will default to the number
 7893        of threads obtained from the `get_threads()` method
 7894        :type threads: int
 7895        """
 7896
 7897        # Function for each partition of the Dask Dataframe
 7898        def partition_function(partition):
 7899            """
 7900            The function `partition_function` applies the `annotation_hgvs_partition` function to
 7901            each row of a DataFrame called `partition`.
 7902
 7903            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
 7904            to be processed
 7905            :return: the result of applying the "annotation_hgvs_partition" function to each row of
 7906            the "partition" dataframe along the axis 1.
 7907            """
 7908            return partition.apply(annotation_hgvs_partition, axis=1)
 7909
 7910        def annotation_hgvs_partition(row) -> str:
 7911            """
 7912            The function `annotation_hgvs_partition` takes in a row of data and returns a string
 7913            containing a list of HGVS names associated with the given genomic coordinates and alleles.
 7914
 7915            :param row: A dictionary-like object that contains the values for the following keys:
 7916            :return: a string that contains the HGVS names associated with the given row of data.
 7917            """
 7918
 7919            chr = row["CHROM"]
 7920            pos = row["POS"]
 7921            ref = row["REF"]
 7922            alt = row["ALT"]
 7923
 7924            # Find list of associated transcripts
 7925            transcripts_list = list(
 7926                polars_conn.execute(
 7927                    f"""
 7928                SELECT transcript
 7929                FROM refseq_df
 7930                WHERE CHROM='{chr}'
 7931                AND POS={pos}
 7932            """
 7933                )["transcript"]
 7934            )
 7935
 7936            # Full HGVS annotation in list
 7937            hgvs_full_list = []
 7938
 7939            for transcript_name in transcripts_list:
 7940
 7941                # Transcript
 7942                transcript = get_transcript(
 7943                    transcripts=transcripts, transcript_name=transcript_name
 7944                )
 7945                # Exon
 7946                if use_exon:
 7947                    exon = transcript.find_exon_number(pos)
 7948                else:
 7949                    exon = None
 7950                # Protein
 7951                transcript_protein = None
 7952                if use_protein or add_protein or full_format:
 7953                    transcripts_protein = list(
 7954                        polars_conn.execute(
 7955                            f"""
 7956                        SELECT protein
 7957                        FROM refseqlink_df
 7958                        WHERE transcript='{transcript_name}'
 7959                        LIMIT 1
 7960                    """
 7961                        )["protein"]
 7962                    )
 7963                    if len(transcripts_protein):
 7964                        transcript_protein = transcripts_protein[0]
 7965
 7966                # HGVS name
 7967                hgvs_name = format_hgvs_name(
 7968                    chr,
 7969                    pos,
 7970                    ref,
 7971                    alt,
 7972                    genome=genome,
 7973                    transcript=transcript,
 7974                    transcript_protein=transcript_protein,
 7975                    exon=exon,
 7976                    use_gene=use_gene,
 7977                    use_protein=use_protein,
 7978                    full_format=full_format,
 7979                    use_version=use_version,
 7980                    codon_type=codon_type,
 7981                )
 7982                hgvs_full_list.append(hgvs_name)
 7983                if add_protein and not use_protein and not full_format:
 7984                    hgvs_name = format_hgvs_name(
 7985                        chr,
 7986                        pos,
 7987                        ref,
 7988                        alt,
 7989                        genome=genome,
 7990                        transcript=transcript,
 7991                        transcript_protein=transcript_protein,
 7992                        exon=exon,
 7993                        use_gene=use_gene,
 7994                        use_protein=True,
 7995                        full_format=False,
 7996                        use_version=use_version,
 7997                        codon_type=codon_type,
 7998                    )
 7999                    hgvs_full_list.append(hgvs_name)
 8000
 8001            # Create liste of HGVS annotations
 8002            hgvs_full = ",".join(hgvs_full_list)
 8003
 8004            return hgvs_full
 8005
 8006        # Polars connexion
 8007        polars_conn = pl.SQLContext(register_globals=True, eager=True)
 8008
 8009        # Config
 8010        config = self.get_config()
 8011
 8012        # Databases
 8013        # Genome
 8014        databases_genomes_folders = (
 8015            config.get("folders", {})
 8016            .get("databases", {})
 8017            .get("genomes", DEFAULT_GENOME_FOLDER)
 8018        )
 8019        databases_genome = (
 8020            config.get("folders", {}).get("databases", {}).get("genomes", "")
 8021        )
 8022        # refseq database folder
 8023        databases_refseq_folders = (
 8024            config.get("folders", {})
 8025            .get("databases", {})
 8026            .get("refseq", DEFAULT_REFSEQ_FOLDER)
 8027        )
 8028        # refseq
 8029        databases_refseq = config.get("databases", {}).get("refSeq", None)
 8030        # refSeqLink
 8031        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)
 8032
 8033        # Param
 8034        param = self.get_param()
 8035
 8036        # Quick HGVS
 8037        if "hgvs_options" in param and param.get("hgvs_options", ""):
 8038            log.info(f"Quick HGVS Annotation:")
 8039            if not param.get("hgvs", None):
 8040                param["hgvs"] = {}
 8041            for option in param.get("hgvs_options", "").split(","):
 8042                option_var_val = option.split("=")
 8043                option_var = option_var_val[0]
 8044                if len(option_var_val) > 1:
 8045                    option_val = option_var_val[1]
 8046                else:
 8047                    option_val = "True"
 8048                if option_val.upper() in ["TRUE"]:
 8049                    option_val = True
 8050                elif option_val.upper() in ["FALSE"]:
 8051                    option_val = False
 8052                log.info(f"   {option_var}={option_val}")
 8053                param["hgvs"][option_var] = option_val
 8054
 8055        # Check if HGVS annotation enabled
 8056        if "hgvs" in param:
 8057            log.info(f"HGVS Annotation... ")
 8058            for hgvs_option in param.get("hgvs", {}):
 8059                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
 8060        else:
 8061            return
 8062
 8063        # HGVS Param
 8064        param_hgvs = param.get("hgvs", {})
 8065        use_exon = param_hgvs.get("use_exon", False)
 8066        use_gene = param_hgvs.get("use_gene", False)
 8067        use_protein = param_hgvs.get("use_protein", False)
 8068        add_protein = param_hgvs.get("add_protein", False)
 8069        full_format = param_hgvs.get("full_format", False)
 8070        use_version = param_hgvs.get("use_version", False)
 8071        codon_type = param_hgvs.get("codon_type", "3")
 8072
        # refSeq and refSeqLink
 8074        databases_refseq = param_hgvs.get("refseq", databases_refseq)
 8075        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)
 8076
 8077        # Assembly
 8078        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 8079
 8080        # Genome
 8081        genome_file = None
 8082        if find_genome(databases_genome):
 8083            genome_file = find_genome(databases_genome)
 8084        else:
 8085            genome_file = find_genome(
 8086                genome_path=databases_genomes_folders, assembly=assembly
 8087            )
 8088        log.debug("Genome: " + str(genome_file))
 8089
        # refSeq
 8091        refseq_file = find_file_prefix(
 8092            input_file=databases_refseq,
 8093            prefix="ncbiRefSeq",
 8094            folder=databases_refseq_folders,
 8095            assembly=assembly,
 8096        )
 8097        log.debug("refSeq: " + str(refseq_file))
 8098
 8099        # refSeqLink
 8100        refseqlink_file = find_file_prefix(
 8101            input_file=databases_refseqlink,
 8102            prefix="ncbiRefSeqLink",
 8103            folder=databases_refseq_folders,
 8104            assembly=assembly,
 8105        )
 8106        log.debug("refSeqLink: " + str(refseqlink_file))
 8107
 8108        # Threads
 8109        if not threads:
 8110            threads = self.get_threads()
 8111        log.debug("Threads: " + str(threads))
 8112
 8113        # Variables
 8114        table_variants = self.get_table_variants(clause="update")
 8115
 8116        # Get variants SNV and InDel only
 8117        query_variants = f"""
 8118            SELECT "#CHROM" AS CHROM, POS, REF, ALT
 8119            FROM {table_variants}
 8120            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
 8121            """
 8122        df_variants = self.get_query_to_df(query_variants)
 8123
 8124        # Added columns
 8125        added_columns = []
 8126
 8127        # Add hgvs column in variants table
 8128        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
 8129        added_column = self.add_column(
 8130            table_variants, hgvs_column_name, "STRING", default_value=None
 8131        )
 8132        added_columns.append(added_column)
 8133
 8134        log.debug(f"refSeq loading...")
 8135        # refSeq in duckDB
 8136        refseq_table = get_refseq_table(
 8137            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
 8138        )
 8139        # Loading all refSeq in Dataframe
 8140        refseq_query = f"""
 8141            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
 8142            FROM {refseq_table}
 8143            JOIN df_variants ON (
 8144                {refseq_table}.chrom = df_variants.CHROM
 8145                AND {refseq_table}.txStart<=df_variants.POS
 8146                AND {refseq_table}.txEnd>=df_variants.POS
 8147            )
 8148        """
 8149        refseq_df = self.conn.query(refseq_query).pl()
 8150
 8151        if refseqlink_file:
 8152            log.debug(f"refSeqLink loading...")
 8153            # refSeqLink in duckDB
 8154            refseqlink_table = get_refseq_table(
 8155                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
 8156            )
 8157            # Loading all refSeqLink in Dataframe
 8158            protacc_column = "protAcc_with_ver"
 8159            mrnaacc_column = "mrnaAcc_with_ver"
 8160            refseqlink_query = f"""
 8161                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
 8162                FROM {refseqlink_table} 
 8163                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
 8164                WHERE protAcc_without_ver IS NOT NULL
 8165            """
 8166            # Polars Dataframe
 8167            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()
 8168
 8169        # Read RefSeq transcripts into a python dict/model.
 8170        log.debug(f"Transcripts loading...")
 8171        with tempfile.TemporaryDirectory() as tmpdir:
 8172            transcripts_query = f"""
 8173                COPY (
 8174                    SELECT {refseq_table}.*
 8175                    FROM {refseq_table}
 8176                    JOIN df_variants ON (
 8177                        {refseq_table}.chrom=df_variants.CHROM
 8178                        AND {refseq_table}.txStart<=df_variants.POS
 8179                        AND {refseq_table}.txEnd>=df_variants.POS
 8180                    )
 8181                )
 8182                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
 8183            """
 8184            self.conn.query(transcripts_query)
 8185            with open(f"{tmpdir}/transcript.tsv") as infile:
 8186                transcripts = read_transcripts(infile)
 8187
 8188        # Polars connexion
 8189        polars_conn = pl.SQLContext(register_globals=True, eager=True)
 8190
 8191        log.debug("Genome loading...")
 8192        # Read genome sequence using pyfaidx.
 8193        genome = Fasta(genome_file)
 8194
 8195        log.debug("Start annotation HGVS...")
 8196
 8197        # Create
 8198        # a Dask Dataframe from Pandas dataframe with partition as number of threads
 8199        ddf = dd.from_pandas(df_variants, npartitions=threads)
 8200
 8201        # Use dask.dataframe.apply() to apply function on each partition
 8202        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)
 8203
 8204        # Convert Dask DataFrame to Pandas Dataframe
 8205        df = ddf.compute()
 8206
 8207        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
 8208        with tempfile.TemporaryDirectory() as tmpdir:
 8209            df_parquet = os.path.join(tmpdir, "df.parquet")
 8210            df.to_parquet(df_parquet)
 8211
 8212            # Update hgvs column
 8213            update_variant_query = f"""
 8214                UPDATE {table_variants}
 8215                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
 8216                FROM read_parquet('{df_parquet}') as df
 8217                WHERE variants."#CHROM" = df.CHROM
 8218                AND variants.POS = df.POS
 8219                AND variants.REF = df.REF
 8220                AND variants.ALT = df.ALT
 8221                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
 8222                """
 8223            self.execute_query(update_variant_query)
 8224
 8225        # Update INFO column
 8226        sql_query_update = f"""
 8227            UPDATE {table_variants}
 8228            SET INFO = 
 8229                concat(
 8230                    CASE 
 8231                        WHEN INFO NOT IN ('','.')
 8232                        THEN concat(INFO, ';')
 8233                        ELSE ''
 8234                    END,
 8235                    'hgvs=',
 8236                    {hgvs_column_name}
 8237                )
 8238            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
 8239            """
 8240        self.execute_query(sql_query_update)
 8241
 8242        # Add header
 8243        HGVS_INFOS = {
 8244            "hgvs": {
 8245                "ID": "hgvs",
 8246                "Number": ".",
 8247                "Type": "String",
 8248                "Description": f"HGVS annotatation with HOWARD",
 8249            }
 8250        }
 8251
 8252        for field in HGVS_INFOS:
 8253            field_ID = HGVS_INFOS[field]["ID"]
 8254            field_description = HGVS_INFOS[field]["Description"]
 8255            self.get_header().infos[field_ID] = vcf.parser._Info(
 8256                field_ID,
 8257                HGVS_INFOS[field]["Number"],
 8258                HGVS_INFOS[field]["Type"],
 8259                field_description,
 8260                "unknown",
 8261                "unknown",
 8262                code_type_map[HGVS_INFOS[field]["Type"]],
 8263            )
 8264
 8265        # Remove added columns
 8266        for added_column in added_columns:
 8267            self.drop_column(column=added_column)
 8268
 8269    ###
 8270    # Calculation
 8271    ###
 8272
 8273    def get_operations_help(
 8274        self, operations_config_dict: dict = {}, operations_config_file: str = None
 8275    ) -> list:
 8276
 8277        # Init
 8278        operations_help = []
 8279
 8280        # operations
 8281        operations = self.get_config_json(
 8282            name="calculations",
 8283            config_dict=operations_config_dict,
 8284            config_file=operations_config_file,
 8285        )
 8286        for op in operations:
 8287            op_name = operations[op].get("name", op).upper()
 8288            op_description = operations[op].get("description", op_name)
 8289            op_available = operations[op].get("available", False)
 8290            if op_available:
 8291                operations_help.append(f"   {op_name}: {op_description}")
 8292
 8293        # Sort operations
 8294        operations_help.sort()
 8295
 8296        # insert header
 8297        operations_help.insert(0, "Available calculation operations:")
 8298
 8299        # Return
 8300        return operations_help
 8301
 8302    def calculation(
 8303        self,
 8304        operations: dict = {},
 8305        operations_config_dict: dict = {},
 8306        operations_config_file: str = None,
 8307    ) -> None:
 8308        """
 8309        It takes a list of operations, and for each operation, it checks if it's a python or sql
 8310        operation, and then calls the appropriate function
 8311
 8312        param json example:
 8313            "calculation": {
 8314                "NOMEN": {
 8315                    "options": {
 8316                        "hgvs_field": "hgvs"
 8317                    },
 8318                "middle" : null
 8319            }
 8320        """
 8321
 8322        # Param
 8323        param = self.get_param()
 8324
 8325        # CHeck operations config file
 8326        if operations_config_file is None:
 8327            operations_config_file = param.get("calculation", {}).get(
 8328                "calculation_config", None
 8329            )
 8330
 8331        # operations config
 8332        operations_config = self.get_config_json(
 8333            name="calculations",
 8334            config_dict=operations_config_dict,
 8335            config_file=operations_config_file,
 8336        )
 8337
 8338        # Upper keys
 8339        operations_config = {k.upper(): v for k, v in operations_config.items()}
 8340
 8341        # Calculations
 8342
 8343        # Operations from param
 8344        operations = param.get("calculation", {}).get("calculations", operations)
 8345
 8346        # Quick calculation - add
 8347        if param.get("calculations", None):
 8348
 8349            # List of operations
 8350            calculations_list = [
 8351                value.strip() for value in param.get("calculations", "").split(",")
 8352            ]
 8353
 8354            # Log
 8355            log.info(f"Quick Calculations:")
 8356            for calculation_key in calculations_list:
 8357                log.info(f"   {calculation_key}")
 8358
 8359            # Create tmp operations (to keep operation order)
 8360            operations_tmp = {}
 8361            for calculation_operation in calculations_list:
 8362                if calculation_operation.upper() not in operations_tmp:
 8363                    log.debug(
 8364                        f"{calculation_operation}.upper() not in {operations_tmp}"
 8365                    )
 8366                    operations_tmp[calculation_operation.upper()] = {}
 8367                    add_value_into_dict(
 8368                        dict_tree=operations_tmp,
 8369                        sections=[
 8370                            calculation_operation.upper(),
 8371                        ],
 8372                        value=operations.get(calculation_operation.upper(), {}),
 8373                    )
 8374            # Add operations already in param
 8375            for calculation_operation in operations:
 8376                if calculation_operation not in operations_tmp:
 8377                    operations_tmp[calculation_operation] = operations.get(
 8378                        calculation_operation, {}
 8379                    )
 8380
 8381            # Update operations in param
 8382            operations = operations_tmp
 8383
 8384        # Operations for calculation
 8385        if not operations:
 8386            operations = param.get("calculation", {}).get("calculations", {})
 8387
 8388        if operations:
 8389            log.info(f"Calculations...")
 8390
 8391        # For each operations
 8392        for operation_name in operations:
 8393            operation_name = operation_name.upper()
 8394            if operation_name not in [""]:
 8395                if operation_name in operations_config:
 8396                    log.info(f"Calculation '{operation_name}'")
 8397                    operation = operations_config[operation_name]
 8398                    operation_type = operation.get("type", "sql")
 8399                    if operation_type == "python":
 8400                        self.calculation_process_function(
 8401                            operation=operation, operation_name=operation_name
 8402                        )
 8403                    elif operation_type == "sql":
 8404                        self.calculation_process_sql(
 8405                            operation=operation, operation_name=operation_name
 8406                        )
 8407                    else:
 8408                        log.error(
 8409                            f"Operations config: Type '{operation_type}' NOT available"
 8410                        )
 8411                        raise ValueError(
 8412                            f"Operations config: Type '{operation_type}' NOT available"
 8413                        )
 8414                else:
 8415                    log.error(
 8416                        f"Operations config: Calculation '{operation_name}' NOT available"
 8417                    )
 8418                    raise ValueError(
 8419                        f"Operations config: Calculation '{operation_name}' NOT available"
 8420                    )
 8421
 8422        # Explode INFOS fields into table fields
 8423        if self.get_explode_infos():
 8424            self.explode_infos(
 8425                prefix=self.get_explode_infos_prefix(),
 8426                fields=self.get_explode_infos_fields(),
 8427                force=True,
 8428            )
 8429
 8430    def calculation_process_sql(
 8431        self, operation: dict, operation_name: str = "unknown"
 8432    ) -> None:
 8433        """
 8434        The `calculation_process_sql` function takes in a mathematical operation as a string and
 8435        performs the operation, updating the specified table with the result.
 8436
 8437        :param operation: The `operation` parameter is a dictionary that contains information about the
 8438        mathematical operation to be performed. It includes the following keys:
 8439        :type operation: dict
 8440        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8441        the mathematical operation being performed. It is used for logging and error handling purposes,
 8442        defaults to unknown
 8443        :type operation_name: str (optional)
 8444        """
 8445
 8446        # Operation infos
 8447        operation_name = operation.get("name", "unknown")
 8448        log.debug(f"process SQL {operation_name}")
 8449        output_column_name = operation.get("output_column_name", operation_name)
 8450        output_column_type = operation.get("output_column_type", "String")
 8451        prefix = operation.get("explode_infos_prefix", "")
 8452        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
 8453        output_column_description = operation.get(
 8454            "output_column_description", f"{operation_name} operation"
 8455        )
 8456        operation_query = operation.get("operation_query", None)
 8457        if isinstance(operation_query, list):
 8458            operation_query = " ".join(operation_query)
 8459        operation_info_fields = operation.get("info_fields", [])
 8460        operation_info_fields_check = operation.get("info_fields_check", False)
 8461        operation_info = operation.get("operation_info", True)
 8462        operation_table = operation.get(
 8463            "table", self.get_table_variants(clause="alter")
 8464        )
 8465
 8466        # table variants
 8467        if operation_table:
 8468            table_variants = operation_table
 8469        else:
 8470            table_variants = self.get_table_variants(clause="alter")
 8471
 8472        if operation_query:
 8473
 8474            # Info fields check
 8475            operation_info_fields_check_result = True
 8476            if operation_info_fields_check:
 8477                header_infos = self.get_header().infos
 8478                for info_field in operation_info_fields:
 8479                    operation_info_fields_check_result = (
 8480                        operation_info_fields_check_result
 8481                        and info_field in header_infos
 8482                    )
 8483
 8484            # If info fields available
 8485            if operation_info_fields_check_result:
 8486
 8487                # Added_columns
 8488                added_columns = []
 8489
 8490                # Create VCF header field
 8491                vcf_reader = self.get_header()
 8492                vcf_reader.infos[output_column_name] = vcf.parser._Info(
 8493                    output_column_name,
 8494                    ".",
 8495                    output_column_type,
 8496                    output_column_description,
 8497                    "howard calculation",
 8498                    "0",
 8499                    self.code_type_map.get(output_column_type),
 8500                )
 8501
 8502                # Explode infos if needed
 8503                log.debug(f"calculation_process_sql prefix {prefix}")
 8504                added_columns += self.explode_infos(
 8505                    prefix=prefix,
 8506                    fields=[output_column_name] + operation_info_fields,
 8507                    force=False,
 8508                    table=table_variants,
 8509                )
 8510
 8511                # Create column
 8512                added_column = self.add_column(
 8513                    table_name=table_variants,
 8514                    column_name=prefix + output_column_name,
 8515                    column_type=output_column_type_sql,
 8516                    default_value="null",
 8517                )
 8518                added_columns.append(added_column)
 8519
 8520                # Operation calculation
 8521                try:
 8522
 8523                    # Query to update calculation column
 8524                    sql_update = f"""
 8525                        UPDATE {table_variants}
 8526                        SET "{prefix}{output_column_name}" = ({operation_query})
 8527                    """
 8528                    self.conn.execute(sql_update)
 8529
 8530                    # Add to INFO
 8531                    if operation_info:
 8532                        sql_update_info = f"""
 8533                            UPDATE {table_variants}
 8534                            SET "INFO" =
 8535                                concat(
 8536                                    CASE
 8537                                        WHEN "INFO" IS NOT NULL
 8538                                        THEN concat("INFO", ';')
 8539                                        ELSE ''
 8540                                    END,
 8541                                    '{output_column_name}=',
 8542                                    "{prefix}{output_column_name}"
 8543                                )
 8544                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
 8545                        """
 8546                        self.conn.execute(sql_update_info)
 8547
 8548                except:
 8549                    log.error(
 8550                        f"Operations config: Calculation '{operation_name}' query failed"
 8551                    )
 8552                    raise ValueError(
 8553                        f"Operations config: Calculation '{operation_name}' query failed"
 8554                    )
 8555
 8556                # Remove added columns
 8557                for added_column in added_columns:
 8558                    log.debug(f"added_column: {added_column}")
 8559                    self.drop_column(column=added_column)
 8560
 8561            else:
 8562                log.error(
 8563                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8564                )
 8565                raise ValueError(
 8566                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8567                )
 8568
 8569        else:
 8570            log.error(
 8571                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8572            )
 8573            raise ValueError(
 8574                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8575            )
 8576
 8577    def calculation_process_function(
 8578        self, operation: dict, operation_name: str = "unknown"
 8579    ) -> None:
 8580        """
 8581        The `calculation_process_function` takes in an operation dictionary and performs the specified
 8582        function with the given parameters.
 8583
 8584        :param operation: The `operation` parameter is a dictionary that contains information about the
 8585        operation to be performed. It has the following keys:
 8586        :type operation: dict
 8587        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8588        the operation being performed. It is used for logging purposes, defaults to unknown
 8589        :type operation_name: str (optional)
 8590        """
 8591
 8592        operation_name = operation["name"]
 8593        log.debug(f"process Python {operation_name}")
 8594        function_name = operation["function_name"]
 8595        function_params = operation["function_params"]
 8596        getattr(self, function_name)(*function_params)
 8597
 8598    def calculation_variant_id(self) -> None:
 8599        """
 8600        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
 8601        updates the INFO field of a variants table with the variant ID.
 8602        """
 8603
 8604        # variant_id annotation field
 8605        variant_id_tag = self.get_variant_id_column()
 8606        added_columns = [variant_id_tag]
 8607
 8608        # variant_id hgvs tags"
 8609        vcf_infos_tags = {
 8610            variant_id_tag: "howard variant ID annotation",
 8611        }
 8612
 8613        # Variants table
 8614        table_variants = self.get_table_variants()
 8615
 8616        # Header
 8617        vcf_reader = self.get_header()
 8618
 8619        # Add variant_id to header
 8620        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
 8621            variant_id_tag,
 8622            ".",
 8623            "String",
 8624            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
 8625            "howard calculation",
 8626            "0",
 8627            self.code_type_map.get("String"),
 8628        )
 8629
 8630        # Update
 8631        sql_update = f"""
 8632            UPDATE {table_variants}
 8633            SET "INFO" = 
 8634                concat(
 8635                    CASE
 8636                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8637                        THEN ''
 8638                        ELSE concat("INFO", ';')
 8639                    END,
 8640                    '{variant_id_tag}=',
 8641                    "{variant_id_tag}"
 8642                )
 8643        """
 8644        self.conn.execute(sql_update)
 8645
 8646        # Remove added columns
 8647        for added_column in added_columns:
 8648            self.drop_column(column=added_column)
 8649
    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        Extract HGVS nomenclatures from the snpEff annotation field and append
        them to the INFO column of the variants table.

        The snpEff sub-field layout is parsed from the header description of
        `snpeff_field`; each variant's annotation string is reduced to an HGVS
        string with `extract_snpeff_hgvs` and written back to INFO as a
        '<snpeff_hgvs>=<value>' tag. Helper columns created along the way are
        dropped before returning. If `snpeff_field` is absent from the header,
        a warning is logged and the table is left unchanged.

        :param snpeff_hgvs: name of the INFO tag that will receive the HGVS
            nomenclatures extracted from the snpEff annotations, defaults to
            snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: INFO field containing the snpEff annotations,
            defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the snpEff header description cannot be parsed
        """

        # Header description for the generated INFO tag
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # NOTE(review): any non-empty exploded-infos prefix is replaced by the
        # hard-coded "INFO/" — presumably to match the exploded column naming;
        # confirm this is intentional and not meant to keep the actual prefix
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Exploded column names for the snpEff source field and the new tag
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Helper columns created by this method, dropped at the end
        added_columns = []

        # Explode the snpEff field into its own table column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Parse the sub-field names from the quoted part of the snpEff
            # header description (format: "... 'A | B | C ...'")
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Alphanumeric-only identifier mapped to the original label
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Helper column uniquely identifying each variant row
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Pull variant id + snpEff annotation into a Pandas dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Compute the HGVS string for each row from its snpEff annotation
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Declare the new tag in the VCF header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append the tag to INFO. NOTE(review): the query references the
            # local variable name `dataframe_snpeff_hgvs` directly — DuckDB
            # resolves it via replacement scan, so the variable must keep this
            # exact name for the join to work.
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                    '{snpeff_hgvs}=',
                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Release the dataframe memory eagerly
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 8786
 8787    def calculation_snpeff_ann_explode(
 8788        self,
 8789        uniquify: bool = True,
 8790        output_format: str = "fields",
 8791        output_prefix: str = "snpeff_",
 8792        snpeff_field: str = "ANN",
 8793    ) -> None:
 8794        """
 8795        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
 8796        exploding the HGVS field and updating variant information accordingly.
 8797
 8798        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
 8799        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
 8800        it indicates that the output should be unique, meaning that duplicate entries should be removed,
 8801        defaults to True
 8802        :type uniquify: bool (optional)
 8803        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
 8804        function specifies the format in which the output annotations will be generated. It has a
 8805        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
 8806        format, defaults to fields
 8807        :type output_format: str (optional)
 8808        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
 8809        method is used to specify the prefix that will be added to the output annotations generated
 8810        during the calculation process. This prefix helps to differentiate the newly added annotations
 8811        from existing ones in the output data. By default, the, defaults to ANN_
 8812        :type output_prefix: str (optional)
 8813        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
 8814        function is used to specify the field in the VCF file that contains SnpEff annotations. This
 8815        field will be processed to explode the HGVS annotations and update the variant information
 8816        accordingly, defaults to ANN
 8817        :type snpeff_field: str (optional)
 8818        """
 8819
 8820        # SnpEff annotation field
 8821        snpeff_hgvs = "snpeff_ann_explode"
 8822
 8823        # Snpeff hgvs tags
 8824        vcf_infos_tags = {
 8825            snpeff_hgvs: "Explode snpEff annotations",
 8826        }
 8827
 8828        # Prefix
 8829        prefix = self.get_explode_infos_prefix()
 8830        if prefix:
 8831            prefix = "INFO/"
 8832
 8833        # snpEff fields
 8834        speff_ann_infos = prefix + snpeff_field
 8835        speff_hgvs_infos = prefix + snpeff_hgvs
 8836
 8837        # Variants table
 8838        table_variants = self.get_table_variants()
 8839
 8840        # Header
 8841        vcf_reader = self.get_header()
 8842
 8843        # Add columns
 8844        added_columns = []
 8845
 8846        # Explode HGVS field in column
 8847        added_columns += self.explode_infos(fields=[snpeff_field])
 8848        log.debug(f"snpeff_field={snpeff_field}")
 8849        log.debug(f"added_columns={added_columns}")
 8850
 8851        if snpeff_field in vcf_reader.infos:
 8852
 8853            # Extract ANN header
 8854            ann_description = vcf_reader.infos[snpeff_field].desc
 8855            pattern = r"'(.+?)'"
 8856            match = re.search(pattern, ann_description)
 8857            if match:
 8858                ann_header_match = match.group(1).split(" | ")
 8859                ann_header = []
 8860                ann_header_desc = {}
 8861                for i in range(len(ann_header_match)):
 8862                    ann_header_info = "".join(
 8863                        char for char in ann_header_match[i] if char.isalnum()
 8864                    )
 8865                    ann_header.append(ann_header_info)
 8866                    ann_header_desc[ann_header_info] = ann_header_match[i]
 8867                if not ann_header_desc:
 8868                    raise ValueError("Invalid header description format")
 8869            else:
 8870                raise ValueError("Invalid header description format")
 8871
 8872            # Create variant id
 8873            variant_id_column = self.get_variant_id_column()
 8874            added_columns += [variant_id_column]
 8875
 8876            # Create dataframe
 8877            dataframe_snpeff_hgvs = self.get_query_to_df(
 8878                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
 8879            )
 8880
 8881            # Create snpEff columns
 8882            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
 8883                speff_ann_infos
 8884            ].apply(
 8885                lambda x: explode_snpeff_ann(
 8886                    str(x),
 8887                    uniquify=uniquify,
 8888                    output_format=output_format,
 8889                    prefix=output_prefix,
 8890                    header=list(ann_header_desc.values()),
 8891                )
 8892            )
 8893
 8894            # Header
 8895            ann_annotations_prefix = ""
 8896            if output_format.upper() in ["JSON"]:
 8897                ann_annotations_prefix = f"{output_prefix}="
 8898                vcf_reader.infos[output_prefix] = vcf.parser._Info(
 8899                    output_prefix,
 8900                    ".",
 8901                    "String",
 8902                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8903                    + " - JSON format",
 8904                    "howard calculation",
 8905                    "0",
 8906                    self.code_type_map.get("String"),
 8907                )
 8908            else:
 8909                for ann_annotation in ann_header:
 8910                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
 8911                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
 8912                        ann_annotation_id,
 8913                        ".",
 8914                        "String",
 8915                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8916                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
 8917                        "howard calculation",
 8918                        "0",
 8919                        self.code_type_map.get("String"),
 8920                    )
 8921
 8922            # Update
 8923            sql_update = f"""
 8924                UPDATE variants
 8925                SET "INFO" = 
 8926                    concat(
 8927                        CASE
 8928                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8929                            THEN ''
 8930                            ELSE concat("INFO", ';')
 8931                        END,
 8932                        CASE 
 8933                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
 8934                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
 8935                            THEN concat(
 8936                                '{ann_annotations_prefix}',
 8937                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
 8938                                )
 8939                            ELSE ''
 8940                        END
 8941                    )
 8942                FROM dataframe_snpeff_hgvs
 8943                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
 8944
 8945            """
 8946            self.conn.execute(sql_update)
 8947
 8948            # Delete dataframe
 8949            del dataframe_snpeff_hgvs
 8950            gc.collect()
 8951
 8952        else:
 8953
 8954            log.warning(
 8955                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
 8956            )
 8957
 8958        # Remove added columns
 8959        for added_column in added_columns:
 8960            self.drop_column(column=added_column)
 8961
    def calculation_extract_nomen(self) -> None:
        """
        Extract the HGVS NOMEN nomenclatures from an exploded HGVS INFO field and
        append them (NOMEN, CNOMEN, PNOMEN, ...) to the INFO column of the variants
        table.

        Options are read from param["calculation"]["calculations"]["NOMEN"]["options"]:
        - "hgvs_field": INFO field containing the HGVS annotations (default "hgvs")
        - "pattern": NOMEN construction pattern passed to `find_nomen`
        - "transcripts": path to a transcripts-of-preference file
        - "transcripts_table"/"transcripts_column": table and column holding a
          per-variant preferred transcript
        - "transcripts_order": preference order between the "column" and "file"
          transcript sources (default ["column", "file"])

        :raises ValueError: if a transcripts file is configured but does not exist
        """

        # Name of the dataframe column holding the dict returned by find_nomen
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN fields to emit, with their VCF header descriptions
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Threads
        # NOTE(review): 'threads' is never used in this method — candidate for removal
        threads = self.get_threads()

        # Prefix used for exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # VCF header object (INFO declarations are added to it below)
        vcf_reader = self.get_header()

        # Columns added temporarily by explode_infos(); dropped at the end
        added_columns = []

        # INFO field containing the HGVS annotations
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # NOMEN construction pattern (None means find_nomen's default)
        nomen_pattern = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("pattern", None)
        )

        # Transcripts of preference, keyed by source ("file", ...)
        transcripts_sources = {}

        # Optional transcripts-of-preference file; first column = transcript ids
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
                transcripts_sources["file"] = transcripts_from_file
            else:
                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Table holding the per-variant preferred transcript (defaults to variants)
        transcripts_table = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_table", self.get_table_variants())
        )
        # Column of that table holding the preferred transcript (None disables it)
        transcripts_column = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_column", None)
        )

        if transcripts_table and transcripts_column:
            # SQL expression selecting the per-variant preferred transcript
            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
            # Explode if not exists
            added_columns += self.explode_infos(
                fields=[transcripts_column], table=transcripts_table
            )
        else:
            # SQL NULL literal: no per-variant transcript available
            extra_field_transcript = f"NULL"

        # Preference order between transcript sources
        transcripts_order = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_order", ["column", "file"])
        )

        # Transcripts-of-preference list coming from the file source
        transcripts = transcripts_sources.get("file", [])

        # Explode HGVS field in column (column name = prefix + hgvs_field)
        added_columns += self.explode_infos(fields=[hgvs_field])

        # Exploded columns currently available; skip silently if HGVS is absent
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Per-variant dataframe: locus + hgvs + preferred transcript
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
            )

            # Rank transcripts by preference (1 = most preferred)
            transcripts_rank = {
                transcript: rank for rank, transcript in enumerate(transcripts, start=1)
            }
            transcripts_len = len(transcripts_rank)

            # Compute the NOMEN dict for each variant row
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
                lambda x: find_nomen(
                    hgvs=x.hgvs,
                    transcript=x.transcript,
                    transcripts=transcripts_rank,
                    pattern=nomen_pattern,
                    transcripts_source_order=transcripts_order,
                    transcripts_len=transcripts_len,
                ),
                axis=1,
            )

            # For each NOMEN field: declare it in the VCF header and build the SQL
            # fragment appending ';FIELD=value' to INFO when the value is non-empty
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Add field to SQL query update
                # NOTE(review): dataframe_hgvs."NOMEN_DICT"."FIELD" is DuckDB
                # struct-field access — presumably the dicts returned by
                # find_nomen are mapped to a STRUCT column; verify against find_nomen
                sql_nomen_fields.append(
                    f"""
                        CASE 
                            WHEN dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT NULL AND dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT IN ('')
                            THEN concat(
                                    ';{nomen_field}=',
                                    dataframe_hgvs."{field_nomen_dict}"."{nomen_field}"
                                )
                            ELSE ''
                        END
                    """
                )

            # Concatenation arguments for the INFO update
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Append all NOMEN fields to INFO, joining on the full locus.
            # DuckDB resolves 'dataframe_hgvs' by its Python variable name
            # (replacement scan), so the name must match the local above.
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS" 
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Release the working dataframe
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 9169
 9170    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
 9171        """
 9172        The function `calculation_find_by_pipeline` performs a calculation to find the number of
 9173        pipeline/sample for a variant and updates the variant information in a VCF file.
 9174
 9175        :param tag: The `tag` parameter is a string that represents the annotation field for the
 9176        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
 9177        VCF header and to update the corresponding field in the variants table, defaults to
 9178        findbypipeline
 9179        :type tag: str (optional)
 9180        """
 9181
 9182        # if FORMAT and samples
 9183        if (
 9184            "FORMAT" in self.get_header_columns_as_list()
 9185            and self.get_header_sample_list()
 9186        ):
 9187
 9188            # findbypipeline annotation field
 9189            findbypipeline_tag = tag
 9190
 9191            # VCF infos tags
 9192            vcf_infos_tags = {
 9193                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
 9194            }
 9195
 9196            # Prefix
 9197            prefix = self.get_explode_infos_prefix()
 9198
 9199            # Field
 9200            findbypipeline_infos = prefix + findbypipeline_tag
 9201
 9202            # Variants table
 9203            table_variants = self.get_table_variants()
 9204
 9205            # Header
 9206            vcf_reader = self.get_header()
 9207
 9208            # Create variant id
 9209            variant_id_column = self.get_variant_id_column()
 9210            added_columns = [variant_id_column]
 9211
 9212            # variant_id, FORMAT and samples
 9213            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9214                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
 9215            )
 9216
 9217            # Create dataframe
 9218            dataframe_findbypipeline = self.get_query_to_df(
 9219                f""" SELECT {samples_fields} FROM {table_variants} """
 9220            )
 9221
 9222            # Create findbypipeline column
 9223            dataframe_findbypipeline[findbypipeline_infos] = (
 9224                dataframe_findbypipeline.apply(
 9225                    lambda row: findbypipeline(
 9226                        row, samples=self.get_header_sample_list()
 9227                    ),
 9228                    axis=1,
 9229                )
 9230            )
 9231
 9232            # Add snpeff_hgvs to header
 9233            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
 9234                findbypipeline_tag,
 9235                ".",
 9236                "String",
 9237                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
 9238                "howard calculation",
 9239                "0",
 9240                self.code_type_map.get("String"),
 9241            )
 9242
 9243            # Update
 9244            sql_update = f"""
 9245                UPDATE variants
 9246                SET "INFO" = 
 9247                    concat(
 9248                        CASE
 9249                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9250                            THEN ''
 9251                            ELSE concat("INFO", ';')
 9252                        END,
 9253                        CASE 
 9254                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
 9255                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
 9256                            THEN concat(
 9257                                    '{findbypipeline_tag}=',
 9258                                    dataframe_findbypipeline."{findbypipeline_infos}"
 9259                                )
 9260                            ELSE ''
 9261                        END
 9262                    )
 9263                FROM dataframe_findbypipeline
 9264                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
 9265            """
 9266            self.conn.execute(sql_update)
 9267
 9268            # Remove added columns
 9269            for added_column in added_columns:
 9270                self.drop_column(column=added_column)
 9271
 9272            # Delete dataframe
 9273            del dataframe_findbypipeline
 9274            gc.collect()
 9275
 9276    def calculation_genotype_concordance(self) -> None:
 9277        """
 9278        The function `calculation_genotype_concordance` calculates the genotype concordance for
 9279        multi-caller VCF files and updates the variant information in the database.
 9280        """
 9281
 9282        # if FORMAT and samples
 9283        if (
 9284            "FORMAT" in self.get_header_columns_as_list()
 9285            and self.get_header_sample_list()
 9286        ):
 9287
 9288            # genotypeconcordance annotation field
 9289            genotypeconcordance_tag = "genotypeconcordance"
 9290
 9291            # VCF infos tags
 9292            vcf_infos_tags = {
 9293                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
 9294            }
 9295
 9296            # Prefix
 9297            prefix = self.get_explode_infos_prefix()
 9298
 9299            # Field
 9300            genotypeconcordance_infos = prefix + genotypeconcordance_tag
 9301
 9302            # Variants table
 9303            table_variants = self.get_table_variants()
 9304
 9305            # Header
 9306            vcf_reader = self.get_header()
 9307
 9308            # Create variant id
 9309            variant_id_column = self.get_variant_id_column()
 9310            added_columns = [variant_id_column]
 9311
 9312            # variant_id, FORMAT and samples
 9313            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9314                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
 9315            )
 9316
 9317            # Create dataframe
 9318            dataframe_genotypeconcordance = self.get_query_to_df(
 9319                f""" SELECT {samples_fields} FROM {table_variants} """
 9320            )
 9321
 9322            # Create genotypeconcordance column
 9323            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
 9324                dataframe_genotypeconcordance.apply(
 9325                    lambda row: genotypeconcordance(
 9326                        row, samples=self.get_header_sample_list()
 9327                    ),
 9328                    axis=1,
 9329                )
 9330            )
 9331
 9332            # Add genotypeconcordance to header
 9333            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
 9334                genotypeconcordance_tag,
 9335                ".",
 9336                "String",
 9337                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
 9338                "howard calculation",
 9339                "0",
 9340                self.code_type_map.get("String"),
 9341            )
 9342
 9343            # Update
 9344            sql_update = f"""
 9345                UPDATE variants
 9346                SET "INFO" = 
 9347                    concat(
 9348                        CASE
 9349                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9350                            THEN ''
 9351                            ELSE concat("INFO", ';')
 9352                        END,
 9353                        CASE
 9354                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
 9355                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
 9356                            THEN concat(
 9357                                    '{genotypeconcordance_tag}=',
 9358                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
 9359                                )
 9360                            ELSE ''
 9361                        END
 9362                    )
 9363                FROM dataframe_genotypeconcordance
 9364                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
 9365            """
 9366            self.conn.execute(sql_update)
 9367
 9368            # Remove added columns
 9369            for added_column in added_columns:
 9370                self.drop_column(column=added_column)
 9371
 9372            # Delete dataframe
 9373            del dataframe_genotypeconcordance
 9374            gc.collect()
 9375
 9376    def calculation_barcode(self, tag: str = "barcode") -> None:
 9377        """
 9378        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
 9379        updates the INFO field in the file with the calculated barcode values.
 9380
 9381        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
 9382        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
 9383        the default tag name is set to "barcode", defaults to barcode
 9384        :type tag: str (optional)
 9385        """
 9386
 9387        # if FORMAT and samples
 9388        if (
 9389            "FORMAT" in self.get_header_columns_as_list()
 9390            and self.get_header_sample_list()
 9391        ):
 9392
 9393            # barcode annotation field
 9394            if not tag:
 9395                tag = "barcode"
 9396
 9397            # VCF infos tags
 9398            vcf_infos_tags = {
 9399                tag: "barcode calculation (VaRank)",
 9400            }
 9401
 9402            # Prefix
 9403            prefix = self.get_explode_infos_prefix()
 9404
 9405            # Field
 9406            barcode_infos = prefix + tag
 9407
 9408            # Variants table
 9409            table_variants = self.get_table_variants()
 9410
 9411            # Header
 9412            vcf_reader = self.get_header()
 9413
 9414            # Create variant id
 9415            variant_id_column = self.get_variant_id_column()
 9416            added_columns = [variant_id_column]
 9417
 9418            # variant_id, FORMAT and samples
 9419            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9420                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
 9421            )
 9422
 9423            # Create dataframe
 9424            dataframe_barcode = self.get_query_to_df(
 9425                f""" SELECT {samples_fields} FROM {table_variants} """
 9426            )
 9427
 9428            # Create barcode column
 9429            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 9430                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
 9431            )
 9432
 9433            # Add barcode to header
 9434            vcf_reader.infos[tag] = vcf.parser._Info(
 9435                tag,
 9436                ".",
 9437                "String",
 9438                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
 9439                "howard calculation",
 9440                "0",
 9441                self.code_type_map.get("String"),
 9442            )
 9443
 9444            # Update
 9445            sql_update = f"""
 9446                UPDATE {table_variants}
 9447                SET "INFO" = 
 9448                    concat(
 9449                        CASE
 9450                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9451                            THEN ''
 9452                            ELSE concat("INFO", ';')
 9453                        END,
 9454                        CASE
 9455                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
 9456                            AND dataframe_barcode."{barcode_infos}" NOT NULL
 9457                            THEN concat(
 9458                                    '{tag}=',
 9459                                    dataframe_barcode."{barcode_infos}"
 9460                                )
 9461                            ELSE ''
 9462                        END
 9463                    )
 9464                FROM dataframe_barcode
 9465                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 9466            """
 9467            self.conn.execute(sql_update)
 9468
 9469            # Remove added columns
 9470            for added_column in added_columns:
 9471                self.drop_column(column=added_column)
 9472
 9473            # Delete dataframe
 9474            del dataframe_barcode
 9475            gc.collect()
 9476
 9477    def calculation_barcode_family(self, tag: str = "BCF") -> None:
 9478        """
 9479        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
 9480        and updates the INFO field in the file with the calculated barcode values.
 9481
 9482        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
 9483        the barcode tag that will be added to the VCF file during the calculation process. If no value
 9484        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
 9485        :type tag: str (optional)
 9486        """
 9487
 9488        # if FORMAT and samples
 9489        if (
 9490            "FORMAT" in self.get_header_columns_as_list()
 9491            and self.get_header_sample_list()
 9492        ):
 9493
 9494            # barcode annotation field
 9495            if not tag:
 9496                tag = "BCF"
 9497
 9498            # VCF infos tags
 9499            vcf_infos_tags = {
 9500                tag: "barcode family calculation",
 9501                f"{tag}S": "barcode family samples",
 9502            }
 9503
 9504            # Param
 9505            param = self.get_param()
 9506            log.debug(f"param={param}")
 9507
 9508            # Prefix
 9509            prefix = self.get_explode_infos_prefix()
 9510
 9511            # PED param
 9512            ped = (
 9513                param.get("calculation", {})
 9514                .get("calculations", {})
 9515                .get("BARCODEFAMILY", {})
 9516                .get("family_pedigree", None)
 9517            )
 9518            log.debug(f"ped={ped}")
 9519
 9520            # Load PED
 9521            if ped:
 9522
 9523                # Pedigree is a file
 9524                if isinstance(ped, str) and os.path.exists(full_path(ped)):
 9525                    log.debug("Pedigree is file")
 9526                    with open(full_path(ped)) as ped:
 9527                        ped = yaml.safe_load(ped)
 9528
 9529                # Pedigree is a string
 9530                elif isinstance(ped, str):
 9531                    log.debug("Pedigree is str")
 9532                    try:
 9533                        ped = json.loads(ped)
 9534                        log.debug("Pedigree is json str")
 9535                    except ValueError as e:
 9536                        ped_samples = ped.split(",")
 9537                        ped = {}
 9538                        for ped_sample in ped_samples:
 9539                            ped[ped_sample] = ped_sample
 9540
 9541                # Pedigree is a dict
 9542                elif isinstance(ped, dict):
 9543                    log.debug("Pedigree is dict")
 9544
 9545                # Pedigree is not well formatted
 9546                else:
 9547                    msg_error = "Pedigree not well formatted"
 9548                    log.error(msg_error)
 9549                    raise ValueError(msg_error)
 9550
 9551                # Construct list
 9552                ped_samples = list(ped.values())
 9553
 9554            else:
 9555                log.debug("Pedigree not defined. Take all samples")
 9556                ped_samples = self.get_header_sample_list()
 9557                ped = {}
 9558                for ped_sample in ped_samples:
 9559                    ped[ped_sample] = ped_sample
 9560
 9561            # Check pedigree
 9562            if not ped or len(ped) == 0:
 9563                msg_error = f"Error in pedigree: samples {ped_samples}"
 9564                log.error(msg_error)
 9565                raise ValueError(msg_error)
 9566
 9567            # Log
 9568            log.info(
 9569                "Calculation 'BARCODEFAMILY' - Samples: "
 9570                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
 9571            )
 9572            log.debug(f"ped_samples={ped_samples}")
 9573
 9574            # Field
 9575            barcode_infos = prefix + tag
 9576
 9577            # Variants table
 9578            table_variants = self.get_table_variants()
 9579
 9580            # Header
 9581            vcf_reader = self.get_header()
 9582
 9583            # Create variant id
 9584            variant_id_column = self.get_variant_id_column()
 9585            added_columns = [variant_id_column]
 9586
 9587            # variant_id, FORMAT and samples
 9588            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9589                [f""" "{sample}" """ for sample in ped_samples]
 9590            )
 9591
 9592            # Create dataframe
 9593            dataframe_barcode = self.get_query_to_df(
 9594                f""" SELECT {samples_fields} FROM {table_variants} """
 9595            )
 9596
 9597            # Create barcode column
 9598            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 9599                lambda row: barcode(row, samples=ped_samples), axis=1
 9600            )
 9601
 9602            # Add barcode family to header
 9603            # Add vaf_normalization to header
 9604            vcf_reader.formats[tag] = vcf.parser._Format(
 9605                id=tag,
 9606                num=".",
 9607                type="String",
 9608                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
 9609                type_code=self.code_type_map.get("String"),
 9610            )
 9611            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
 9612                id=f"{tag}S",
 9613                num=".",
 9614                type="String",
 9615                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
 9616                type_code=self.code_type_map.get("String"),
 9617            )
 9618
 9619            # Update
 9620            # for sample in ped_samples:
 9621            sql_update_set = []
 9622            for sample in self.get_header_sample_list() + ["FORMAT"]:
 9623                if sample in ped_samples:
 9624                    value = f'dataframe_barcode."{barcode_infos}"'
 9625                    value_samples = (
 9626                        "'"
 9627                        + ",".join([f""" "{sample}" """ for sample in ped_samples])
 9628                        + "'"
 9629                    )
 9630                    ped_samples
 9631                elif sample == "FORMAT":
 9632                    value = f"'{tag}'"
 9633                    value_samples = f"'{tag}S'"
 9634                else:
 9635                    value = "'.'"
 9636                    value_samples = "'.'"
 9637                format_regex = r"[a-zA-Z0-9\s]"
 9638                sql_update_set.append(
 9639                    f"""
 9640                        "{sample}" = 
 9641                        concat(
 9642                            CASE
 9643                                WHEN {table_variants}."{sample}" = './.'
 9644                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
 9645                                ELSE {table_variants}."{sample}"
 9646                            END,
 9647                            ':',
 9648                            {value},
 9649                            ':',
 9650                            {value_samples}
 9651                        )
 9652                    """
 9653                )
 9654
 9655            sql_update_set_join = ", ".join(sql_update_set)
 9656            sql_update = f"""
 9657                UPDATE {table_variants}
 9658                SET {sql_update_set_join}
 9659                FROM dataframe_barcode
 9660                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 9661            """
 9662            self.conn.execute(sql_update)
 9663
 9664            # Remove added columns
 9665            for added_column in added_columns:
 9666                self.drop_column(column=added_column)
 9667
 9668            # Delete dataframe
 9669            del dataframe_barcode
 9670            gc.collect()
 9671
 9672    def calculation_trio(self) -> None:
 9673        """
 9674        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
 9675        information to the INFO field of each variant.
 9676        """
 9677
 9678        # if FORMAT and samples
 9679        if (
 9680            "FORMAT" in self.get_header_columns_as_list()
 9681            and self.get_header_sample_list()
 9682        ):
 9683
 9684            # trio annotation field
 9685            trio_tag = "trio"
 9686
 9687            # VCF infos tags
 9688            vcf_infos_tags = {
 9689                "trio": "trio calculation",
 9690            }
 9691
 9692            # Param
 9693            param = self.get_param()
 9694
 9695            # Prefix
 9696            prefix = self.get_explode_infos_prefix()
 9697
 9698            # Trio param
 9699            trio_ped = (
 9700                param.get("calculation", {})
 9701                .get("calculations", {})
 9702                .get("TRIO", {})
 9703                .get("trio_pedigree", None)
 9704            )
 9705
 9706            # Load trio
 9707            if trio_ped:
 9708
 9709                # Trio pedigree is a file
 9710                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
 9711                    log.debug("TRIO pedigree is file")
 9712                    with open(full_path(trio_ped)) as trio_ped:
 9713                        trio_ped = yaml.safe_load(trio_ped)
 9714
 9715                # Trio pedigree is a string
 9716                elif isinstance(trio_ped, str):
 9717                    log.debug("TRIO pedigree is str")
 9718                    try:
 9719                        trio_ped = json.loads(trio_ped)
 9720                        log.debug("TRIO pedigree is json str")
 9721                    except ValueError as e:
 9722                        trio_samples = trio_ped.split(",")
 9723                        if len(trio_samples) == 3:
 9724                            trio_ped = {
 9725                                "father": trio_samples[0],
 9726                                "mother": trio_samples[1],
 9727                                "child": trio_samples[2],
 9728                            }
 9729                            log.debug("TRIO pedigree is list str")
 9730                        else:
 9731                            msg_error = "TRIO pedigree not well formatted"
 9732                            log.error(msg_error)
 9733                            raise ValueError(msg_error)
 9734
 9735                # Trio pedigree is a dict
 9736                elif isinstance(trio_ped, dict):
 9737                    log.debug("TRIO pedigree is dict")
 9738
 9739                # Trio pedigree is not well formatted
 9740                else:
 9741                    msg_error = "TRIO pedigree not well formatted"
 9742                    log.error(msg_error)
 9743                    raise ValueError(msg_error)
 9744
 9745                # Construct trio list
 9746                trio_samples = [
 9747                    trio_ped.get("father", ""),
 9748                    trio_ped.get("mother", ""),
 9749                    trio_ped.get("child", ""),
 9750                ]
 9751
 9752            else:
 9753                log.debug("TRIO pedigree not defined. Take the first 3 samples")
 9754                samples_list = self.get_header_sample_list()
 9755                if len(samples_list) >= 3:
 9756                    trio_samples = self.get_header_sample_list()[0:3]
 9757                    trio_ped = {
 9758                        "father": trio_samples[0],
 9759                        "mother": trio_samples[1],
 9760                        "child": trio_samples[2],
 9761                    }
 9762                else:
 9763                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
 9764                    log.error(msg_error)
 9765                    raise ValueError(msg_error)
 9766
 9767            # Check trio pedigree
 9768            if not trio_ped or len(trio_ped) != 3:
 9769                msg_error = f"Error in TRIO pedigree: {trio_ped}"
 9770                log.error(msg_error)
 9771                raise ValueError(msg_error)
 9772
 9773            # Log
 9774            log.info(
 9775                f"Calculation 'TRIO' - Samples: "
 9776                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
 9777            )
 9778
 9779            # Field
 9780            trio_infos = prefix + trio_tag
 9781
 9782            # Variants table
 9783            table_variants = self.get_table_variants()
 9784
 9785            # Header
 9786            vcf_reader = self.get_header()
 9787
 9788            # Create variant id
 9789            variant_id_column = self.get_variant_id_column()
 9790            added_columns = [variant_id_column]
 9791
 9792            # variant_id, FORMAT and samples
 9793            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9794                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
 9795            )
 9796
 9797            # Create dataframe
 9798            dataframe_trio = self.get_query_to_df(
 9799                f""" SELECT {samples_fields} FROM {table_variants} """
 9800            )
 9801
 9802            # Create trio column
 9803            dataframe_trio[trio_infos] = dataframe_trio.apply(
 9804                lambda row: trio(row, samples=trio_samples), axis=1
 9805            )
 9806
 9807            # Add trio to header
 9808            vcf_reader.infos[trio_tag] = vcf.parser._Info(
 9809                trio_tag,
 9810                ".",
 9811                "String",
 9812                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
 9813                "howard calculation",
 9814                "0",
 9815                self.code_type_map.get("String"),
 9816            )
 9817
 9818            # Update
 9819            sql_update = f"""
 9820                UPDATE {table_variants}
 9821                SET "INFO" = 
 9822                    concat(
 9823                        CASE
 9824                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9825                            THEN ''
 9826                            ELSE concat("INFO", ';')
 9827                        END,
 9828                        CASE
 9829                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
 9830                             AND dataframe_trio."{trio_infos}" NOT NULL
 9831                            THEN concat(
 9832                                    '{trio_tag}=',
 9833                                    dataframe_trio."{trio_infos}"
 9834                                )
 9835                            ELSE ''
 9836                        END
 9837                    )
 9838                FROM dataframe_trio
 9839                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
 9840            """
 9841            self.conn.execute(sql_update)
 9842
 9843            # Remove added columns
 9844            for added_column in added_columns:
 9845                self.drop_column(column=added_column)
 9846
 9847            # Delete dataframe
 9848            del dataframe_trio
 9849            gc.collect()
 9850
 9851    def calculation_vaf_normalization(self) -> None:
 9852        """
 9853        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
 9854        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
 9855        :return: The function does not return anything.
 9856        """
 9857
 9858        # if FORMAT and samples
 9859        if (
 9860            "FORMAT" in self.get_header_columns_as_list()
 9861            and self.get_header_sample_list()
 9862        ):
 9863
 9864            # vaf_normalization annotation field
 9865            vaf_normalization_tag = "VAF"
 9866
 9867            # VCF infos tags
 9868            vcf_infos_tags = {
 9869                "VAF": "VAF Variant Frequency",
 9870            }
 9871
 9872            # Prefix
 9873            prefix = self.get_explode_infos_prefix()
 9874
 9875            # Variants table
 9876            table_variants = self.get_table_variants()
 9877
 9878            # Header
 9879            vcf_reader = self.get_header()
 9880
 9881            # Do not calculate if VAF already exists
 9882            if "VAF" in vcf_reader.formats:
 9883                log.debug("VAF already on genotypes")
 9884                return
 9885
 9886            # Create variant id
 9887            variant_id_column = self.get_variant_id_column()
 9888            added_columns = [variant_id_column]
 9889
 9890            # variant_id, FORMAT and samples
 9891            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9892                f""" "{sample}" """ for sample in self.get_header_sample_list()
 9893            )
 9894
 9895            # Create dataframe
 9896            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
 9897            log.debug(f"query={query}")
 9898            dataframe_vaf_normalization = self.get_query_to_df(query=query)
 9899
 9900            vaf_normalization_set = []
 9901
 9902            # for each sample vaf_normalization
 9903            for sample in self.get_header_sample_list():
 9904                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
 9905                    lambda row: vaf_normalization(row, sample=sample), axis=1
 9906                )
 9907                vaf_normalization_set.append(
 9908                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
 9909                )
 9910
 9911            # Add VAF to FORMAT
 9912            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
 9913                "FORMAT"
 9914            ].apply(lambda x: str(x) + ":VAF")
 9915            vaf_normalization_set.append(
 9916                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
 9917            )
 9918
 9919            # Add vaf_normalization to header
 9920            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
 9921                id=vaf_normalization_tag,
 9922                num="1",
 9923                type="Float",
 9924                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
 9925                type_code=self.code_type_map.get("Float"),
 9926            )
 9927
 9928            # Create fields to add in INFO
 9929            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
 9930
 9931            # Update
 9932            sql_update = f"""
 9933                UPDATE {table_variants}
 9934                SET {sql_vaf_normalization_set}
 9935                FROM dataframe_vaf_normalization
 9936                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
 9937
 9938            """
 9939            self.conn.execute(sql_update)
 9940
 9941            # Remove added columns
 9942            for added_column in added_columns:
 9943                self.drop_column(column=added_column)
 9944
 9945            # Delete dataframe
 9946            del dataframe_vaf_normalization
 9947            gc.collect()
 9948
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        For the given `info` tag (e.g. "VAF"), per-sample values are aggregated into the INFO
        fields '<info>_stats_nb', '<info>_stats_list', '<info>_stats_min', '<info>_stats_max',
        '<info>_stats_mean', '<info>_stats_mediane' and '<info>_stats_stdev'. Nothing is done
        when the VCF has no FORMAT column or no sample.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Genotype statistics require genotypes (FORMAT column and at least one sample)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags: one INFO field per computed statistic
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Dataframe column holding the per-variant stats dict
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (temporary column, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Create dataframe
            # NOTE(review): the SQL below references 'dataframe_vaf_stats' by name,
            # which appears to rely on DuckDB's replacement scan of this local
            # variable - confirm before renaming it.
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create vaf_stats column: one dict of statistics per variant row
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # List of vcf tags
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract stats: one dataframe column per statistic
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Declare the statistic field in the VCF header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # No separator before the first field, ';' before the others
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                # NOTE(review): a missing statistic is mapped to '' above, which is
                # NOT NULL - the THEN branch then emits '<stat>=' with an empty
                # value; confirm whether empty values should be skipped instead.
                sql_vaf_stats_fields.append(
                    f"""
                        CASE
                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
                            THEN concat(
                                    '{sep}{stat}=',
                                    dataframe_vaf_stats."{stat}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)

            # Update: append the statistics fields to the INFO column
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_stats
            gc.collect()
10086
10087    def calculation_transcripts_annotation(
10088        self, info_json: str = None, info_format: str = None
10089    ) -> None:
10090        """
10091        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
10092        field to it if transcripts are available.
10093
10094        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
10095        is a string parameter that represents the information field to be used in the transcripts JSON.
10096        It is used to specify the JSON format for the transcripts information. If no value is provided
10097        when calling the method, it defaults to "
10098        :type info_json: str
10099        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
10100        method is a string parameter that specifies the format of the information field to be used in
10101        the transcripts JSON. It is used to define the format of the information field
10102        :type info_format: str
10103        """
10104
10105        # Create transcripts table
10106        transcripts_table = self.create_transcript_view()
10107
10108        # Add info field
10109        if transcripts_table:
10110            self.transcript_view_to_variants(
10111                transcripts_table=transcripts_table,
10112                transcripts_info_field_json=info_json,
10113                transcripts_info_field_format=info_format,
10114            )
10115        else:
10116            log.info("No Transcripts to process. Check param.json file configuration")
10117
10118    def calculation_transcripts_prioritization(self) -> None:
10119        """
10120        The function `calculation_transcripts_prioritization` creates a transcripts table and
10121        prioritizes transcripts based on certain criteria.
10122        """
10123
10124        # Create transcripts table
10125        transcripts_table = self.create_transcript_view()
10126
10127        # Add info field
10128        if transcripts_table:
10129            self.transcripts_prioritization(transcripts_table=transcripts_table)
10130        else:
10131            log.info("No Transcripts to process. Check param.json file configuration")
10132
10133    def calculation_transcripts_export(self) -> None:
10134        """ """
10135
10136        # Create transcripts table
10137        transcripts_table = self.create_transcript_view()
10138
10139        # Add info field
10140        if transcripts_table:
10141            self.transcripts_export(transcripts_table=transcripts_table)
10142        else:
10143            log.info("No Transcripts to process. Check param.json file configuration")
10144
10145    ###############
10146    # Transcripts #
10147    ###############
10148
10149    def transcripts_export(
10150        self, transcripts_table: str = None, param: dict = {}
10151    ) -> bool:
10152        """ """
10153
10154        log.debug("Start transcripts export...")
10155
10156        # Param
10157        if not param:
10158            param = self.get_param()
10159
10160        # Param export
10161        param_transcript_export = param.get("transcripts", {}).get("export", {})
10162
10163        # Output file
10164        transcripts_export_output = param_transcript_export.get("output", None)
10165
10166        if not param_transcript_export or not transcripts_export_output:
10167            log.warning(f"No transcriipts export parameters defined!")
10168            return False
10169
10170        # List of transcripts annotations
10171        query_describe = f"""
10172            SELECT column_name
10173            FROM (
10174                    DESCRIBE SELECT * FROM {transcripts_table}
10175                )
10176            WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO')
10177        """
10178        transcripts_annotations_list = list(
10179            self.get_query_to_df(query=query_describe)["column_name"]
10180        )
10181
10182        # Create transcripts table for export
10183        transcripts_table_export = f"{transcripts_table}_export_" + "".join(
10184            random.choices(string.ascii_uppercase + string.digits, k=10)
10185        )
10186        query_create_transcripts_table_export = f"""
10187            CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table})
10188        """
10189        self.execute_query(query=query_create_transcripts_table_export)
10190
10191        # Output file format
10192        transcripts_export_output_format = get_file_format(
10193            filename=transcripts_export_output
10194        )
10195
10196        # Format VCF - construct INFO
10197        if transcripts_export_output_format in ["vcf"]:
10198
10199            # Construct query update INFO and header
10200            query_update_info = []
10201            for field in transcripts_annotations_list:
10202
10203                # If field not in header
10204                if field not in self.get_header_infos_list():
10205
10206                    # Add PZ Transcript in header
10207                    self.get_header().infos[field] = vcf.parser._Info(
10208                        field,
10209                        ".",
10210                        "String",
10211                        f"Annotation '{field}' from transcript view",
10212                        "unknown",
10213                        "unknown",
10214                        0,
10215                    )
10216
10217                # Add field as INFO/tag
10218                query_update_info.append(
10219                    f"""
10220                        CASE
10221                            WHEN "{field}" IS NOT NULL
10222                            THEN concat('{field}=', "{field}", ';')    
10223                            ELSE ''     
10224                        END
10225                        """
10226                )
10227
10228            # Query param
10229            query_update_info_value = (
10230                f""" concat('',  {", ".join(query_update_info)}) """
10231            )
10232            query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """
10233
10234        else:
10235
10236            # Query param
10237            query_update_info_value = f""" NULL """
10238            query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """
10239
10240        # Update query INFO column
10241        query_update = f"""
10242            UPDATE {transcripts_table_export}
10243            SET INFO = {query_update_info_value}
10244
10245        """
10246        self.execute_query(query=query_update)
10247
10248        # Export
10249        self.export_output(
10250            output_file=transcripts_export_output,
10251            query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """,
10252        )
10253
10254        # Drop transcripts export table
10255        query_drop_transcripts_table_export = f"""
10256            DROP TABLE {transcripts_table_export}
10257        """
10258        self.execute_query(query=query_drop_transcripts_table_export)
10259
10260    def transcripts_prioritization(
10261        self, transcripts_table: str = None, param: dict = {}
10262    ) -> bool:
10263        """
10264        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
10265        and updates the variants table with the prioritized information.
10266
10267        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10268        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
10269        This parameter is used to identify the table where the transcripts data is stored for the
10270        prioritization process
10271        :type transcripts_table: str
10272        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
10273        that contains various configuration settings for the prioritization process of transcripts. It
10274        is used to customize the behavior of the prioritization algorithm and includes settings such as
10275        the prefix for prioritization fields, default profiles, and other
10276        :type param: dict
10277        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
10278        transcripts prioritization process is successfully completed, and `False` if there are any
10279        issues or if no profile is defined for transcripts prioritization.
10280        """
10281
10282        log.debug("Start transcripts prioritization...")
10283
10284        # Param
10285        if not param:
10286            param = self.get_param()
10287
10288        # Variants table
10289        table_variants = self.get_table_variants()
10290
10291        # Transcripts table
10292        if transcripts_table is None:
10293            transcripts_table = self.create_transcript_view(
10294                transcripts_table="transcripts", param=param
10295            )
10296        if transcripts_table is None:
10297            msg_err = "No Transcripts table availalble"
10298            log.error(msg_err)
10299            raise ValueError(msg_err)
10300        log.debug(f"transcripts_table={transcripts_table}")
10301
10302        # Get transcripts columns
10303        columns_as_list_query = f"""
10304            DESCRIBE {transcripts_table}
10305        """
10306        columns_as_list = list(
10307            self.get_query_to_df(columns_as_list_query)["column_name"]
10308        )
10309
10310        # Create INFO if not exists
10311        if "INFO" not in columns_as_list:
10312            query_add_info = f"""
10313                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
10314            """
10315            self.execute_query(query_add_info)
10316
10317        # Prioritization param and Force only PZ Score and Flag
10318        pz_param = param.get("transcripts", {}).get("prioritization", {})
10319
10320        # PZ profile by default
10321        pz_profile_default = (
10322            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
10323        )
10324
10325        # Exit if no profile
10326        if pz_profile_default is None:
10327            log.warning("No profile defined for transcripts prioritization")
10328            return False
10329
10330        # PZ fields
10331        pz_param_pzfields = {}
10332
10333        # PZ field transcripts
10334        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
10335
10336        # Add PZ Transcript in header
10337        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
10338            pz_fields_transcripts,
10339            ".",
10340            "String",
10341            f"Transcript selected from prioritization process, profile {pz_profile_default}",
10342            "unknown",
10343            "unknown",
10344            code_type_map["String"],
10345        )
10346
10347        # Mandatory fields if asked in param
10348        pz_mandatory_fields_list = [
10349            "Score",
10350            "Flag",
10351            "Tags",
10352            "Comment",
10353            "Infos",
10354            "Class",
10355        ]
10356        pz_mandatory_fields = []
10357        for pz_mandatory_field in pz_mandatory_fields_list:
10358            pz_mandatory_fields.append(
10359                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
10360            )
10361
10362        # PZ fields in param
10363        pz_param_mandatory_fields = []
10364        for pz_field in pz_param.get("pzfields", []):
10365            if pz_field in pz_mandatory_fields_list:
10366                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
10367                    pz_param.get("pzprefix", "PTZ") + pz_field
10368                )
10369                pz_param_mandatory_fields.append(
10370                    pz_param.get("pzprefix", "PTZ") + pz_field
10371                )
10372            else:
10373                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
10374                pz_param_pzfields[pz_field] = pz_field_new
10375
10376                # Add PZ Transcript in header
10377                self.get_header().infos[pz_field_new] = vcf.parser._Info(
10378                    pz_field_new,
10379                    ".",
10380                    "String",
10381                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
10382                    "unknown",
10383                    "unknown",
10384                    code_type_map["String"],
10385                )
10386
10387        # PZ fields param
10388        pz_mandatory_fields = pz_param_mandatory_fields
10389        pz_param["pzfields"] = pz_mandatory_fields
10390
10391        # Prioritization
10392        prioritization_result = self.prioritization(
10393            table=transcripts_table,
10394            pz_param=param.get("transcripts", {}).get("prioritization", {}),
10395        )
10396        if not prioritization_result:
10397            log.warning("Transcripts prioritization not processed")
10398            return False
10399
10400        # PZ fields sql query
10401        query_update_select_list = []
10402        query_update_concat_list = []
10403        query_update_order_list = []
10404        for pz_param_pzfield in set(
10405            list(pz_param_pzfields.keys()) + pz_mandatory_fields
10406        ):
10407            query_update_select_list.append(f" {pz_param_pzfield}, ")
10408
10409        for pz_param_pzfield in pz_param_pzfields:
10410            query_update_concat_list.append(
10411                f"""
10412                    , CASE 
10413                        WHEN {pz_param_pzfield} IS NOT NULL
10414                        THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
10415                        ELSE ''
10416                    END
10417                """
10418            )
10419
10420        # Order by
10421        pz_orders = (
10422            param.get("transcripts", {})
10423            .get("prioritization", {})
10424            .get("prioritization_transcripts_order", {})
10425        )
10426        if not pz_orders:
10427            pz_orders = {
10428                pz_param.get("pzprefix", "PTZ") + "Flag": "DESC",
10429                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
10430            }
10431        for pz_order in pz_orders:
10432            query_update_order_list.append(
10433                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
10434            )
10435
10436        # Fields to explode
10437        fields_to_explode = (
10438            list(pz_param_pzfields.keys())
10439            + pz_mandatory_fields
10440            + list(pz_orders.keys())
10441        )
10442        # Remove transcript column as a specific transcript column
10443        if "transcript" in fields_to_explode:
10444            fields_to_explode.remove("transcript")
10445
10446        # Fields intranscripts table
10447        query_transcripts_table = f"""
10448            DESCRIBE SELECT * FROM {transcripts_table}
10449        """
10450        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)
10451
10452        # Check fields to explode
10453        for field_to_explode in fields_to_explode:
10454            if field_to_explode not in self.get_header_infos_list() + list(
10455                query_transcripts_table.column_name
10456            ):
10457                msg_err = f"INFO/{field_to_explode} NOT IN header"
10458                log.error(msg_err)
10459                raise ValueError(msg_err)
10460
10461        # Explode fields to explode
10462        self.explode_infos(
10463            table=transcripts_table,
10464            fields=fields_to_explode,
10465        )
10466
10467        # Transcript preference file
10468        transcripts_preference_file = (
10469            param.get("transcripts", {})
10470            .get("prioritization", {})
10471            .get("prioritization_transcripts", {})
10472        )
10473        transcripts_preference_file = full_path(transcripts_preference_file)
10474
10475        # Transcript preference forced
10476        transcript_preference_force = (
10477            param.get("transcripts", {})
10478            .get("prioritization", {})
10479            .get("prioritization_transcripts_force", False)
10480        )
10481        # Transcript version forced
10482        transcript_version_force = (
10483            param.get("transcripts", {})
10484            .get("prioritization", {})
10485            .get("prioritization_transcripts_version_force", False)
10486        )
10487
10488        # Transcripts Ranking
10489        if transcripts_preference_file:
10490
10491            # Transcripts file to dataframe
10492            if os.path.exists(transcripts_preference_file):
10493                transcripts_preference_dataframe = transcripts_file_to_df(
10494                    transcripts_preference_file
10495                )
10496            else:
10497                log.error(
10498                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10499                )
10500                raise ValueError(
10501                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10502                )
10503
10504            # Order by depending to transcript preference forcing
10505            if transcript_preference_force:
10506                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
10507            else:
10508                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """
10509
10510            # Transcript columns joined depend on version consideration
10511            if transcript_version_force:
10512                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
10513            else:
10514                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """
10515
10516            # Query ranking for update
10517            query_update_ranking = f"""
10518                SELECT
10519                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
10520                    ROW_NUMBER() OVER (
10521                        PARTITION BY "#CHROM", POS, REF, ALT
10522                        ORDER BY {order_by}
10523                    ) AS rn
10524                FROM {transcripts_table}
10525                LEFT JOIN 
10526                    (
10527                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
10528                        FROM transcripts_preference_dataframe
10529                    ) AS transcripts_preference
10530                ON {transcripts_version_join}
10531            """
10532
10533        else:
10534
10535            # Query ranking for update
10536            query_update_ranking = f"""
10537                SELECT
10538                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
10539                    ROW_NUMBER() OVER (
10540                        PARTITION BY "#CHROM", POS, REF, ALT
10541                        ORDER BY {" , ".join(query_update_order_list)}
10542                    ) AS rn
10543                FROM {transcripts_table}
10544            """
10545
10546        # Export Transcripts prioritization infos to variants table
10547        query_update = f"""
10548            WITH RankedTranscripts AS (
10549                {query_update_ranking}
10550            )
10551            UPDATE {table_variants}
10552                SET
10553                INFO = CONCAT(CASE
10554                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
10555                            THEN ''
10556                            ELSE concat("INFO", ';')
10557                        END,
10558                        concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
10559                        )
10560            FROM
10561                RankedTranscripts
10562            WHERE
10563                rn = 1
10564                AND variants."#CHROM" = RankedTranscripts."#CHROM"
10565                AND variants."POS" = RankedTranscripts."POS"
10566                AND variants."REF" = RankedTranscripts."REF"
10567                AND variants."ALT" = RankedTranscripts."ALT"     
10568        """
10569
10570        # log.debug(f"query_update={query_update}")
10571        self.execute_query(query=query_update)
10572
10573        # Return
10574        return True
10575
10576    def create_transcript_view_from_columns_map(
10577        self,
10578        transcripts_table: str = "transcripts",
10579        columns_maps: dict = {},
10580        added_columns: list = [],
10581        temporary_tables: list = None,
10582        annotation_fields: list = None,
10583        column_rename: dict = {},
10584        column_clean: bool = False,
10585        column_case: str = None,
10586    ) -> tuple[list, list, list]:
10587        """
10588        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
10589        specified columns mapping for transcripts data.
10590
10591        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10592        of the table where the transcripts data is stored or will be stored in the database. This table
10593        typically contains information about transcripts such as Ensembl transcript IDs, gene names,
10594        scores, predictions, etc. It defaults to "transcripts, defaults to transcripts
10595        :type transcripts_table: str (optional)
10596        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information
10597        about how to map columns from a transcripts table to create a view. Each entry in the
10598        `columns_maps` list represents a mapping configuration for a specific set of columns. It
10599        typically includes details such as the main transcript column and additional information columns
10600        :type columns_maps: dict
10601        :param added_columns: The `added_columns` parameter in the
10602        `create_transcript_view_from_columns_map` function is a list that stores the additional columns
10603        that will be added to the view being created based on the columns map provided. These columns
10604        are generated by exploding the transcript information columns along with the main transcript
10605        column
10606        :type added_columns: list
10607        :param temporary_tables: The `temporary_tables` parameter in the
10608        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
10609        tables created during the process of creating a transcript view from a columns map. These
10610        temporary tables are used to store intermediate results or transformations before the final view
10611        is generated
10612        :type temporary_tables: list
10613        :param annotation_fields: The `annotation_fields` parameter in the
10614        `create_transcript_view_from_columns_map` function is a list that stores the fields that are
10615        used for annotation in the query view creation process. These fields are extracted from the
10616        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
10617        :type annotation_fields: list
10618        :param column_rename: The `column_rename` parameter in the
10619        `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify
10620        custom renaming for columns during the creation of the temporary table view. This parameter
10621        provides a mapping of original column names to the desired renamed column names. By using this
10622        parameter,
10623        :type column_rename: dict
10624        :param column_clean: The `column_clean` parameter in the
10625        `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the
10626        column values should be cleaned or not. If set to `True`, the column values will be cleaned by
10627        removing any non-alphanumeric characters from them. This cleaning process ensures, defaults to
10628        False
10629        :type column_clean: bool (optional)
10630        :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map`
10631        function is used to specify the case transformation to be applied to the columns during the view
10632        creation process. It allows you to control whether the column values should be converted to
10633        lowercase, uppercase, or remain unchanged
10634        :type column_case: str
10635        :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three
10636        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
10637        """
10638
10639        log.debug("Start transcrpts view creation from columns map...")
10640
10641        # "from_columns_map": [
10642        #     {
10643        #         "transcripts_column": "Ensembl_transcriptid",
10644        #         "transcripts_infos_columns": [
10645        #             "genename",
10646        #             "Ensembl_geneid",
10647        #             "LIST_S2_score",
10648        #             "LIST_S2_pred",
10649        #         ],
10650        #     },
10651        #     {
10652        #         "transcripts_column": "Ensembl_transcriptid",
10653        #         "transcripts_infos_columns": [
10654        #             "genename",
10655        #             "VARITY_R_score",
10656        #             "Aloft_pred",
10657        #         ],
10658        #     },
10659        # ],
10660
10661        # Init
10662        if temporary_tables is None:
10663            temporary_tables = []
10664        if annotation_fields is None:
10665            annotation_fields = []
10666
10667        # Variants table
10668        table_variants = self.get_table_variants()
10669
10670        for columns_map in columns_maps:
10671
10672            # Log
10673            log.debug(f"columns_map={columns_map}")
10674
10675            # Transcript column
10676            transcripts_column = columns_map.get("transcripts_column", None)
10677
10678            # Transcripts infos columns
10679            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
10680
10681            # Transcripts infos columns rename
10682            column_rename = columns_map.get("column_rename", column_rename)
10683
10684            # Transcripts infos columns clean
10685            column_clean = columns_map.get("column_clean", column_clean)
10686
10687            # Transcripts infos columns case
10688            column_case = columns_map.get("column_case", column_case)
10689
10690            if transcripts_column is not None:
10691
10692                # Explode
10693                added_columns += self.explode_infos(
10694                    fields=[transcripts_column] + transcripts_infos_columns
10695                )
10696
10697                # View clauses
10698                clause_select_variants = []
10699                clause_select_tanscripts = []
10700                for field in [transcripts_column] + transcripts_infos_columns:
10701
10702                    # AS field
10703                    as_field = field
10704
10705                    # Rename
10706                    if column_rename:
10707                        as_field = column_rename.get(as_field, as_field)
10708
10709                    # Clean
10710                    if column_clean:
10711                        as_field = clean_annotation_field(as_field)
10712
10713                    # Case
10714                    if column_case:
10715                        if column_case.lower() in ["lower"]:
10716                            as_field = as_field.lower()
10717                        elif column_case.lower() in ["upper"]:
10718                            as_field = as_field.upper()
10719
10720                    # Clause select Variants
10721                    clause_select_variants.append(
10722                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10723                    )
10724
10725                    if field in [transcripts_column]:
10726                        clause_select_tanscripts.append(
10727                            f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10728                        )
10729                    else:
10730                        clause_select_tanscripts.append(
10731                            f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """
10732                        )
10733                        annotation_fields.append(as_field)
10734
10735                # Query View
10736                query = f""" 
10737                    SELECT
10738                        "#CHROM", POS, REF, ALT, INFO,
10739                        "{transcripts_column}" AS 'transcript',
10740                        {", ".join(clause_select_tanscripts)}
10741                    FROM (
10742                        SELECT 
10743                            "#CHROM", POS, REF, ALT, INFO,
10744                            {", ".join(clause_select_variants)}
10745                        FROM {table_variants}
10746                        )
10747                    WHERE "{transcripts_column}" IS NOT NULL
10748                """
10749
10750                # Create temporary table
10751                temporary_table = transcripts_table + "".join(
10752                    random.choices(string.ascii_uppercase + string.digits, k=10)
10753                )
10754
10755                # Temporary view
10756                temporary_tables.append(temporary_table)
10757                query_view = f"""
10758                    CREATE view {temporary_table}
10759                    AS ({query})
10760                """
10761                self.execute_query(query=query_view)
10762
10763        return added_columns, temporary_tables, annotation_fields
10764
10765    def create_transcript_view_from_column_format(
10766        self,
10767        transcripts_table: str = "transcripts",
10768        column_formats: dict = {},
10769        temporary_tables: list = None,
10770        annotation_fields: list = None,
10771        column_rename: dict = {},
10772        column_clean: bool = False,
10773        column_case: str = None,
10774    ) -> tuple[list, list, list]:
10775        """
10776        The `create_transcript_view_from_column_format` function generates a transcript view based on
10777        specified column formats, adds additional columns and annotation fields, and returns the list of
10778        temporary tables and annotation fields.
10779
10780        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10781        of the table containing the transcripts data. This table will be used as the base table for
10782        creating the transcript view. The default value for this parameter is "transcripts", but you can
10783        provide a different table name if needed, defaults to transcripts
10784        :type transcripts_table: str (optional)
10785        :param column_formats: The `column_formats` parameter is a dictionary that contains information
10786        about the columns to be used for creating the transcript view. Each entry in the dictionary
10787        specifies the mapping between a transcripts column and a transcripts infos column. This
10788        parameter allows you to define how the columns from the transcripts table should be transformed
10789        or mapped
10790        :type column_formats: dict
10791        :param temporary_tables: The `temporary_tables` parameter in the
10792        `create_transcript_view_from_column_format` function is a list that stores the names of
10793        temporary views created during the process of creating a transcript view from a column format.
10794        These temporary views are used to manipulate and extract data before generating the final
10795        transcript view
10796        :type temporary_tables: list
10797        :param annotation_fields: The `annotation_fields` parameter in the
10798        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
10799        that are extracted from the temporary views created during the process. These annotation fields
10800        are obtained by querying the temporary views and extracting the column names excluding specific
10801        columns like `#CH
10802        :type annotation_fields: list
10803        :param column_rename: The `column_rename` parameter in the
10804        `create_transcript_view_from_column_format` function is a dictionary that allows you to specify
10805        custom renaming of columns in the transcripts infos table. By providing a mapping of original
10806        column names to new column names in this dictionary, you can rename specific columns during the
10807        process
10808        :type column_rename: dict
10809        :param column_clean: The `column_clean` parameter in the
10810        `create_transcript_view_from_column_format` function is a boolean flag that determines whether
10811        the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns
10812        will be cleaned during the creation of the transcript view based on the specified column format,
10813        defaults to False
10814        :type column_clean: bool (optional)
10815        :param column_case: The `column_case` parameter in the
10816        `create_transcript_view_from_column_format` function is used to specify the case transformation
10817        to be applied to the columns in the transcript view. It can be set to either "upper" or "lower"
10818        to convert the column names to uppercase or lowercase, respectively
10819        :type column_case: str
10820        :return: The `create_transcript_view_from_column_format` function returns two lists:
10821        `temporary_tables` and `annotation_fields`.
10822        """
10823
10824        log.debug("Start transcrpts view creation from column format...")
10825
10826        #  "from_column_format": [
10827        #     {
10828        #         "transcripts_column": "ANN",
10829        #         "transcripts_infos_column": "Feature_ID",
10830        #     }
10831        # ],
10832
10833        # Init
10834        if temporary_tables is None:
10835            temporary_tables = []
10836        if annotation_fields is None:
10837            annotation_fields = []
10838
10839        added_columns = []
10840
10841        for column_format in column_formats:
10842
10843            # annotation field and transcript annotation field
10844            annotation_field = column_format.get("transcripts_column", "ANN")
10845            transcript_annotation = column_format.get(
10846                "transcripts_infos_column", "Feature_ID"
10847            )
10848
10849            # Transcripts infos columns rename
10850            column_rename = column_format.get("column_rename", column_rename)
10851
10852            # Transcripts infos columns clean
10853            column_clean = column_format.get("column_clean", column_clean)
10854
10855            # Transcripts infos columns case
10856            column_case = column_format.get("column_case", column_case)
10857
10858            # Temporary View name
10859            temporary_view_name = transcripts_table + "".join(
10860                random.choices(string.ascii_uppercase + string.digits, k=10)
10861            )
10862
10863            # Create temporary view name
10864            temporary_view_name, added_columns = self.annotation_format_to_table(
10865                annotation_field=annotation_field,
10866                view_name=temporary_view_name,
10867                annotation_id=transcript_annotation,
10868                column_rename=column_rename,
10869                column_clean=column_clean,
10870                column_case=column_case,
10871            )
10872
10873            # Annotation fields
10874            if temporary_view_name:
10875                query_annotation_fields = f"""
10876                    SELECT *
10877                    FROM (
10878                        DESCRIBE SELECT *
10879                        FROM {temporary_view_name}
10880                        )
10881                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
10882                """
10883                df_annotation_fields = self.get_query_to_df(
10884                    query=query_annotation_fields
10885                )
10886
10887                # Add temporary view and annotation fields
10888                temporary_tables.append(temporary_view_name)
10889                annotation_fields += list(set(df_annotation_fields["column_name"]))
10890
10891        return added_columns, temporary_tables, annotation_fields
10892
10893    def create_transcript_view(
10894        self,
10895        transcripts_table: str = None,
10896        transcripts_table_drop: bool = False,
10897        param: dict = {},
10898    ) -> str:
10899        """
10900        The `create_transcript_view` function generates a transcript view by processing data from a
10901        specified table based on provided parameters and structural information.
10902
10903        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
10904        is used to specify the name of the table that will store the final transcript view data. If a table
10905        name is not provided, the function will create a new table to store the transcript view data, and by
10906        default,, defaults to transcripts
10907        :type transcripts_table: str (optional)
10908        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
10909        `create_transcript_view` function is a boolean parameter that determines whether to drop the
10910        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
10911        the function will drop the existing transcripts table if it exists, defaults to False
10912        :type transcripts_table_drop: bool (optional)
10913        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
10914        contains information needed to create a transcript view. It includes details such as the structure
10915        of the transcripts, columns mapping, column formats, and other necessary information for generating
10916        the view. This parameter allows for flexibility and customization
10917        :type param: dict
10918        :return: The `create_transcript_view` function returns the name of the transcripts table that was
10919        created or modified during the execution of the function.
10920        """
10921
10922        log.debug("Start transcripts view creation...")
10923
10924        # Default
10925        transcripts_table_default = "transcripts"
10926
10927        # Param
10928        if not param:
10929            param = self.get_param()
10930
10931        # Struct
10932        struct = param.get("transcripts", {}).get("struct", None)
10933
10934        # Transcript veresion
10935        transcript_id_remove_version = param.get("transcripts", {}).get(
10936            "transcript_id_remove_version", False
10937        )
10938
10939        # Transcripts mapping
10940        transcript_id_mapping_file = param.get("transcripts", {}).get(
10941            "transcript_id_mapping_file", None
10942        )
10943
10944        # Transcripts mapping
10945        transcript_id_mapping_force = param.get("transcripts", {}).get(
10946            "transcript_id_mapping_force", None
10947        )
10948
10949        # Transcripts table
10950        if transcripts_table is None:
10951            transcripts_table = param.get("transcripts", {}).get(
10952                "table", transcripts_table_default
10953            )
10954
10955        # Check transcripts table exists
10956        if transcripts_table:
10957
10958            # Query to check if transcripts table exists
10959            query_check_table = f"""
10960                SELECT * 
10961                FROM information_schema.tables 
10962                WHERE table_name = '{transcripts_table}'
10963            """
10964            df_check_table = self.get_query_to_df(query=query_check_table)
10965
10966            # Check if transcripts table exists
10967            if len(df_check_table) > 0 and not transcripts_table_drop:
10968                log.debug(f"Table {transcripts_table} exists and not drop option")
10969                return transcripts_table
10970
10971        if struct:
10972
10973            # added_columns
10974            added_columns = []
10975
10976            # Temporary tables
10977            temporary_tables = []
10978
10979            # Annotation fields
10980            annotation_fields = []
10981
10982            # from columns map
10983            columns_maps = struct.get("from_columns_map", [])
10984            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
10985                self.create_transcript_view_from_columns_map(
10986                    transcripts_table=transcripts_table,
10987                    columns_maps=columns_maps,
10988                    added_columns=added_columns,
10989                    temporary_tables=temporary_tables,
10990                    annotation_fields=annotation_fields,
10991                )
10992            )
10993            added_columns += added_columns_tmp
10994            temporary_tables += temporary_tables_tmp
10995            annotation_fields += annotation_fields_tmp
10996
10997            # from column format
10998            column_formats = struct.get("from_column_format", [])
10999            added_columns, temporary_tables_tmp, annotation_fields_tmp = (
11000                self.create_transcript_view_from_column_format(
11001                    transcripts_table=transcripts_table,
11002                    column_formats=column_formats,
11003                    temporary_tables=temporary_tables,
11004                    annotation_fields=annotation_fields,
11005                )
11006            )
11007            added_columns += added_columns_tmp
11008            temporary_tables += temporary_tables_tmp
11009            annotation_fields += annotation_fields_tmp
11010
11011            # Remove some specific fields/column
11012            annotation_fields = list(set(annotation_fields))
11013            for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
11014                if field in annotation_fields:
11015                    annotation_fields.remove(field)
11016
11017            # Merge temporary tables query
11018            query_merge = ""
11019            for temporary_table in list(set(temporary_tables)):
11020
11021                # First temporary table
11022                if not query_merge:
11023                    query_merge = f"""
11024                        SELECT * FROM {temporary_table}
11025                    """
11026                # other temporary table (using UNION)
11027                else:
11028                    query_merge += f"""
11029                        UNION BY NAME SELECT * FROM {temporary_table}
11030                    """
11031
11032            # transcript table tmp
11033            transcript_table_tmp = "transcripts_tmp"
11034            transcript_table_tmp2 = "transcripts_tmp2"
11035            transcript_table_tmp3 = "transcripts_tmp3"
11036
11037            # Merge on transcript
11038            query_merge_on_transcripts_annotation_fields = []
11039
11040            # Add transcript list
11041            query_merge_on_transcripts_annotation_fields.append(
11042                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
11043            )
11044
11045            # Aggregate all annotations fields
11046            for annotation_field in set(annotation_fields):
11047                query_merge_on_transcripts_annotation_fields.append(
11048                    f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
11049                )
11050
11051            # Transcripts mapping
11052            if transcript_id_mapping_file:
11053
11054                # Transcript dataframe
11055                transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
11056                transcript_id_mapping_dataframe = transcripts_file_to_df(
11057                    transcript_id_mapping_file, column_names=["transcript", "alias"]
11058                )
11059
11060                # Transcript version remove
11061                if transcript_id_remove_version:
11062                    query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
11063                    query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
11064                    query_left_join = f"""
11065                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
11066                    """
11067                else:
11068                    query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
11069                    query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
11070                    query_left_join = f"""
11071                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
11072                    """
11073
11074                # Transcript column for group by merge
11075                query_transcript_merge_group_by = """
11076                        CASE
11077                            WHEN transcript_mapped NOT IN ('')
11078                            THEN split_part(transcript_mapped, '.', 1)
11079                            ELSE split_part(transcript_original, '.', 1)
11080                        END
11081                    """
11082
11083                # Merge query
11084                transcripts_tmp2_query = f"""
11085                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
11086                    FROM ({query_merge}) AS {transcript_table_tmp}
11087                    {query_left_join}
11088                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
11089                """
11090
11091                # Retrive columns after mege
11092                transcripts_tmp2_describe_query = f"""
11093                    DESCRIBE {transcripts_tmp2_query}
11094                """
11095                transcripts_tmp2_describe_list = list(
11096                    self.get_query_to_df(query=transcripts_tmp2_describe_query)[
11097                        "column_name"
11098                    ]
11099                )
11100
11101                # Create list of columns for select clause
11102                transcripts_tmp2_describe_select_clause = []
11103                for field in transcripts_tmp2_describe_list:
11104                    if field not in [
11105                        "#CHROM",
11106                        "POS",
11107                        "REF",
11108                        "ALT",
11109                        "INFO",
11110                        "transcript_mapped",
11111                    ]:
11112                        as_field = field
11113                        if field in ["transcript_original"]:
11114                            as_field = "transcripts_mapped"
11115                        transcripts_tmp2_describe_select_clause.append(
11116                            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
11117                        )
11118
11119                # Merge with mapping
11120                query_merge_on_transcripts = f"""
11121                    SELECT
11122                        "#CHROM", POS, REF, ALT, INFO,
11123                        CASE
11124                            WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
11125                            THEN ANY_VALUE(transcript_mapped)
11126                            ELSE ANY_VALUE(transcript_original)
11127                        END AS transcript,
11128                        {", ".join(transcripts_tmp2_describe_select_clause)}
11129                    FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
11130                    GROUP BY "#CHROM", POS, REF, ALT, INFO,
11131                        {query_transcript_merge_group_by}
11132                """
11133
11134                # Add transcript filter from mapping file
11135                if transcript_id_mapping_force:
11136                    query_merge_on_transcripts = f"""
11137                        SELECT *
11138                        FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
11139                        WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
11140                    """
11141
11142            # No transcript mapping
11143            else:
11144
11145                # Remove transcript version
11146                if transcript_id_remove_version:
11147                    query_transcript_column = f"""
11148                        split_part({transcript_table_tmp}.transcript, '.', 1)
11149                    """
11150                else:
11151                    query_transcript_column = """
11152                        transcript
11153                    """
11154
11155                # Query sections
11156                query_transcript_column_select = (
11157                    f"{query_transcript_column} AS transcript"
11158                )
11159                query_transcript_column_group_by = query_transcript_column
11160
11161                # Query for transcripts view
11162                query_merge_on_transcripts = f"""
11163                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
11164                    FROM ({query_merge}) AS {transcript_table_tmp}
11165                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
11166                """
11167
11168            # Drop transcript view is necessary
11169            if transcripts_table_drop:
11170                query_drop = f"""
11171                    DROP TABLE IF EXISTS {transcripts_table};
11172                """
11173                self.execute_query(query=query_drop)
11174
11175            # List of unique #CHROM
11176            query_unique_chrom = f"""
11177                SELECT DISTINCT "#CHROM"
11178                FROM variants AS subquery
11179            """
11180            unique_chroms = self.get_query_to_df(query=query_unique_chrom)
11181
11182            # Create table with structure but without data, if not exists
11183            query_create_table = f"""
11184                CREATE TABLE IF NOT EXISTS {transcripts_table} AS
11185                SELECT * FROM ({query_merge_on_transcripts}) AS subquery LIMIT 0
11186            """
11187            self.execute_query(query=query_create_table)
11188
11189            # Process by #CHROM
11190            for chrom in unique_chroms["#CHROM"]:
11191
11192                # Log
11193                log.debug(f"Processing #CHROM={chrom}")
11194
11195                # Select data by #CHROM
11196                query_chunk = f"""
11197                    SELECT *
11198                    FROM ({query_merge_on_transcripts})
11199                    WHERE "#CHROM" = '{chrom}'
11200                """
11201
11202                # Insert data
11203                query_insert_chunk = f"""
11204                    INSERT INTO {transcripts_table}
11205                    {query_chunk}
11206                """
11207                self.execute_query(query=query_insert_chunk)
11208
11209            # Remove temporary tables
11210            if temporary_tables:
11211                for temporary_table in list(set(temporary_tables)):
11212                    try:
11213                        query_drop_tmp_table = f"""
11214                            DROP TABLE IF EXISTS {temporary_table}
11215                        """
11216                        self.execute_query(query=query_drop_tmp_table)
11217                    except Exception as e:
11218                        log.debug(f"'{temporary_table}' Not a table")
11219                    try:
11220                        query_drop_tmp_table = f"""
11221                            DROP VIEW IF EXISTS {temporary_table}
11222                        """
11223                        self.execute_query(query=query_drop_tmp_table)
11224                    except Exception as e:
11225                        log.debug(f"'{temporary_table}' Not a view")
11226
11227            # Remove added columns
11228            for added_column in added_columns:
11229                self.drop_column(column=added_column)
11230
11231        else:
11232
11233            transcripts_table = None
11234
11235        return transcripts_table
11236
11237    def annotation_format_to_table(
11238        self,
11239        annotation_field: str = "ANN",
11240        annotation_id: str = "Feature_ID",
11241        view_name: str = "transcripts",
11242        column_rename: dict = {},
11243        column_clean: bool = False,
11244        column_case: str = None,
11245    ) -> str:
11246        """
11247        The `annotation_format_to_table` function converts annotation data from a VCF file into a
11248        structured table format, ensuring unique values and creating a temporary table for further
11249        processing or analysis.
11250
11251        :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure
11252        unique values in the output or not. If set to `True`, the function will make sure that the
11253        output values are unique, defaults to True
11254        :type uniquify: bool (optional)
11255        :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file
11256        that contains the annotation information for each variant. This field is used to extract the
11257        annotation details for further processing in the function. By default, it is set to "ANN",
11258        defaults to ANN
11259        :type annotation_field: str (optional)
11260        :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method
11261        is used to specify the identifier for the annotation feature. This identifier will be used as a
11262        column name in the resulting table or view that is created based on the annotation data. It
11263        helps in uniquely identifying each annotation entry in the, defaults to Feature_ID
11264        :type annotation_id: str (optional)
11265        :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used
11266        to specify the name of the temporary table that will be created to store the transformed
11267        annotation data. This table will hold the extracted information from the annotation field in a
11268        structured format for further processing or analysis. By default,, defaults to transcripts
11269        :type view_name: str (optional)
11270        :param column_rename: The `column_rename` parameter in the `annotation_format_to_table` method
11271        is a dictionary that allows you to specify custom renaming for columns. By providing key-value
11272        pairs in this dictionary, you can rename specific columns in the resulting table or view that is
11273        created based on the annotation data. This feature enables
11274        :type column_rename: dict
11275        :param column_clean: The `column_clean` parameter in the `annotation_format_to_table` method is
11276        a boolean flag that determines whether the annotation field should undergo a cleaning process.
11277        If set to `True`, the function will clean the annotation field before further processing. This
11278        cleaning step may involve removing any unwanted characters, formatting inconsistencies, defaults
11279        to False
11280        :type column_clean: bool (optional)
11281        :param column_case: The `column_case` parameter in the `annotation_format_to_table` method is
11282        used to specify the case transformation to be applied to the column names extracted from the
11283        annotation data. It allows you to set the case of the column names to either lowercase or
11284        uppercase for consistency or other specific requirements during the conversion
11285        :type column_case: str
11286        :return: The function `annotation_format_to_table` is returning the name of the view created,
11287        which is stored in the variable `view_name`.
11288        """
11289
11290        # Transcript annotation
11291        if column_rename:
11292            annotation_id = column_rename.get(annotation_id, annotation_id)
11293
11294        if column_clean:
11295            annotation_id = clean_annotation_field(annotation_id)
11296
11297        # Prefix
11298        prefix = self.get_explode_infos_prefix()
11299        if prefix:
11300            prefix = "INFO/"
11301
11302        # Variants table
11303        table_variants = self.get_table_variants()
11304
11305        # Header
11306        vcf_reader = self.get_header()
11307
11308        # Add columns
11309        added_columns = []
11310
11311        # Explode HGVS field in column
11312        added_columns += self.explode_infos(fields=[annotation_field])
11313
11314        if annotation_field in vcf_reader.infos:
11315
11316            # Extract ANN header
11317            ann_description = vcf_reader.infos[annotation_field].desc
11318            pattern = r"'(.+?)'"
11319            match = re.search(pattern, ann_description)
11320            if match:
11321                ann_header_match = match.group(1).split(" | ")
11322                ann_header = []
11323                ann_header_desc = {}
11324                for i in range(len(ann_header_match)):
11325                    ann_header_info = "".join(
11326                        char for char in ann_header_match[i] if char.isalnum()
11327                    )
11328                    ann_header.append(ann_header_info)
11329                    ann_header_desc[ann_header_info] = ann_header_match[i]
11330                if not ann_header_desc:
11331                    raise ValueError("Invalid header description format")
11332            else:
11333                raise ValueError("Invalid header description format")
11334
11335            # Create dataframe for keys column type
11336            dataframe_annotation_format = self.get_query_to_df(
11337                f""" 
11338                WITH exploded_annotations AS (
11339                    SELECT
11340                        UNNEST(STRING_SPLIT(ANN, ',')) AS annotation
11341                    FROM {table_variants}
11342                ),
11343                split_annotations AS (
11344                    SELECT
11345                        {", ".join([f"SPLIT_PART(annotation, '|', {i+1}) AS '{header}'" for i, header in enumerate(ann_header_desc.values())])},
11346                    FROM exploded_annotations
11347                )
11348                SELECT * FROM split_annotations
11349                LIMIT 1000
11350                """
11351            )
11352
11353            # Init
11354            query_list_keys = []
11355            key_i = 0
11356
11357            for key in dataframe_annotation_format.keys():
11358
11359                # Key
11360                key_i += 1
11361                key_clean = key
11362
11363                # key rename
11364                if column_rename:
11365                    key_clean = column_rename.get(key_clean, key_clean)
11366
11367                # key clean
11368                if column_clean:
11369                    key_clean = clean_annotation_field(key_clean)
11370
11371                # Key case
11372                if column_case:
11373                    if column_case.lower() in ["lower"]:
11374                        key_clean = key_clean.lower()
11375                    elif column_case.lower() in ["upper"]:
11376                        key_clean = key_clean.upper()
11377
11378                # Detect column type
11379                column_type = detect_column_type(dataframe_annotation_format[key])
11380
11381                # Append key to list
11382                query_list_keys.append(
11383                    f""" NULLIF(SPLIT_PART(annotation, '|', {key_i}), '')::{column_type} AS '{prefix}{key_clean}' """
11384                )
11385
11386            # Create temporary table
11387            query_create_view = f"""
11388                CREATE VIEW {view_name} AS (
11389                    WITH exploded_annotations AS (
11390                        SELECT
11391                            "#CHROM",
11392                            POS,
11393                            REF,
11394                            ALT,
11395                            INFO,
11396                            UNNEST(STRING_SPLIT(ANN, ',')) AS annotation
11397                        FROM {table_variants}
11398                    ),
11399                    split_annotations AS (
11400                        SELECT
11401                            "#CHROM",
11402                            POS,
11403                            REF,
11404                            ALT,
11405                            INFO,
11406                            {", ".join(query_list_keys)},
11407                        FROM exploded_annotations
11408                    )
11409                    SELECT *, {annotation_id} AS 'transcript' FROM split_annotations
11410                )
11411            """
11412            log.debug(f"query_create_view: {query_create_view}")
11413            self.execute_query(query=query_create_view)
11414
11415        else:
11416
11417            # Return None
11418            view_name = None
11419
11420        return view_name, added_columns
11421
11422    def transcript_view_to_variants(
11423        self,
11424        transcripts_table: str = None,
11425        transcripts_column_id: str = None,
11426        transcripts_info_json: str = None,
11427        transcripts_info_field_json: str = None,
11428        transcripts_info_format: str = None,
11429        transcripts_info_field_format: str = None,
11430        param: dict = {},
11431    ) -> bool:
11432        """
11433        The `transcript_view_to_variants` function updates a variants table with information from
11434        transcripts in JSON format.
11435
11436        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the
11437        table containing the transcripts data. If this parameter is not provided, the function will
11438        attempt to retrieve it from the `param` dictionary or use a default value of "transcripts"
11439        :type transcripts_table: str
11440        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the
11441        column in the `transcripts_table` that contains the unique identifier for each transcript. This
11442        identifier is used to match transcripts with variants in the database
11443        :type transcripts_column_id: str
11444        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name
11445        of the column in the variants table where the transcripts information will be stored in JSON
11446        format. This parameter allows you to define the column in the variants table that will hold the
11447        JSON-formatted information about transcripts
11448        :type transcripts_info_json: str
11449        :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to
11450        specify the field in the VCF header that will contain information about transcripts in JSON
11451        format. This field will be added to the VCF header as an INFO field with the specified name
11452        :type transcripts_info_field_json: str
11453        :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the
11454        format of the information about transcripts that will be stored in the variants table. This
11455        format can be used to define how the transcript information will be structured or displayed
11456        within the variants table
11457        :type transcripts_info_format: str
11458        :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to
11459        specify the field in the VCF header that will contain information about transcripts in a
11460        specific format. This field will be added to the VCF header as an INFO field with the specified
11461        name
11462        :type transcripts_info_field_format: str
11463        :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary
11464        that contains various configuration settings related to transcripts. It is used to provide
11465        default values for certain parameters if they are not explicitly provided when calling the
11466        method. The `param` dictionary can be passed as an argument
11467        :type param: dict
11468        :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True`
11469        if the operation is successful and `False` if certain conditions are not met.
11470        """
11471
11472        msg_info_prefix = "Start transcripts view to variants annotations"
11473
11474        log.debug(f"{msg_info_prefix}...")
11475
11476        # Default
11477        transcripts_table_default = "transcripts"
11478        transcripts_column_id_default = "transcript"
11479        transcripts_info_json_default = None
11480        transcripts_info_format_default = None
11481        transcripts_info_field_json_default = None
11482        transcripts_info_field_format_default = None
11483
11484        # Param
11485        if not param:
11486            param = self.get_param()
11487
11488        # Transcripts table
11489        if transcripts_table is None:
11490            transcripts_table = param.get("transcripts", {}).get(
11491                "table", transcripts_table_default
11492            )
11493
11494        # Transcripts column ID
11495        if transcripts_column_id is None:
11496            transcripts_column_id = param.get("transcripts", {}).get(
11497                "column_id", transcripts_column_id_default
11498            )
11499
11500        # Transcripts info json
11501        if transcripts_info_json is None:
11502            transcripts_info_json = param.get("transcripts", {}).get(
11503                "transcripts_info_json", transcripts_info_json_default
11504            )
11505
11506        # Transcripts info field JSON
11507        if transcripts_info_field_json is None:
11508            transcripts_info_field_json = param.get("transcripts", {}).get(
11509                "transcripts_info_field_json", transcripts_info_field_json_default
11510            )
11511        # if transcripts_info_field_json is not None and transcripts_info_json is None:
11512        #     transcripts_info_json = transcripts_info_field_json
11513
11514        # Transcripts info format
11515        if transcripts_info_format is None:
11516            transcripts_info_format = param.get("transcripts", {}).get(
11517                "transcripts_info_format", transcripts_info_format_default
11518            )
11519
11520        # Transcripts info field FORMAT
11521        if transcripts_info_field_format is None:
11522            transcripts_info_field_format = param.get("transcripts", {}).get(
11523                "transcripts_info_field_format", transcripts_info_field_format_default
11524            )
11525        # if (
11526        #     transcripts_info_field_format is not None
11527        #     and transcripts_info_format is None
11528        # ):
11529        #     transcripts_info_format = transcripts_info_field_format
11530
11531        # Variants table
11532        table_variants = self.get_table_variants()
11533
11534        # Check info columns param
11535        if (
11536            transcripts_info_json is None
11537            and transcripts_info_field_json is None
11538            and transcripts_info_format is None
11539            and transcripts_info_field_format is None
11540        ):
11541            return False
11542
11543        # Transcripts infos columns
11544        query_transcripts_infos_columns = f"""
11545            SELECT *
11546            FROM (
11547                DESCRIBE SELECT * FROM {transcripts_table}
11548                )
11549            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
11550        """
11551        transcripts_infos_columns = list(
11552            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
11553        )
11554
11555        # View results
11556        clause_select = []
11557        clause_to_json = []
11558        clause_to_format = []
11559        for field in transcripts_infos_columns:
11560            # Do not consider INFO field for export into fields
11561            if field not in ["INFO"]:
11562                clause_select.append(
11563                    f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """
11564                )
11565                clause_to_json.append(f""" '{field}': "{field}" """)
11566                clause_to_format.append(f""" "{field}" """)
11567
11568        # Update
11569        update_set_json = []
11570        update_set_format = []
11571
11572        # VCF header
11573        vcf_reader = self.get_header()
11574
11575        # Transcripts to info column in JSON
11576        if transcripts_info_json:
11577
11578            # Create column on variants table
11579            self.add_column(
11580                table_name=table_variants,
11581                column_name=transcripts_info_json,
11582                column_type="JSON",
11583                default_value=None,
11584                drop=False,
11585            )
11586
11587            # Add header
11588            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
11589                transcripts_info_json,
11590                ".",
11591                "String",
11592                "Transcripts in JSON format",
11593                "unknwon",
11594                "unknwon",
11595                self.code_type_map["String"],
11596            )
11597
11598            # Add to update
11599            update_set_json.append(
11600                f""" {transcripts_info_json}=t.{transcripts_info_json} """
11601            )
11602
11603        # Transcripts to info field in JSON
11604        if transcripts_info_field_json:
11605
11606            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")
11607
11608            # Add to update
11609            update_set_json.append(
11610                f""" 
11611                    INFO = concat(
11612                            CASE
11613                                WHEN INFO NOT IN ('', '.')
11614                                THEN INFO
11615                                ELSE ''
11616                            END,
11617                            CASE
11618                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
11619                                THEN concat(
11620                                    ';{transcripts_info_field_json}=',
11621                                    t.{transcripts_info_json}
11622                                )
11623                                ELSE ''
11624                            END
11625                            )
11626                """
11627            )
11628
11629            # Add header
11630            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
11631                transcripts_info_field_json,
11632                ".",
11633                "String",
11634                "Transcripts in JSON format",
11635                "unknwon",
11636                "unknwon",
11637                self.code_type_map["String"],
11638            )
11639
11640        if update_set_json:
11641
11642            # Update query
11643            query_update = f"""
11644                UPDATE {table_variants}
11645                    SET {", ".join(update_set_json)}
11646                FROM
11647                (
11648                    SELECT
11649                        "#CHROM", POS, REF, ALT,
11650                            concat(
11651                            '{{',
11652                            string_agg(
11653                                '"' || "{transcripts_column_id}" || '":' ||
11654                                to_json(json_output)
11655                            ),
11656                            '}}'
11657                            )::JSON AS {transcripts_info_json}
11658                    FROM
11659                        (
11660                        SELECT
11661                            "#CHROM", POS, REF, ALT,
11662                            "{transcripts_column_id}",
11663                            to_json(
11664                                {{{",".join(clause_to_json)}}}
11665                            )::JSON AS json_output
11666                        FROM
11667                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11668                        WHERE "{transcripts_column_id}" IS NOT NULL
11669                        )
11670                    GROUP BY "#CHROM", POS, REF, ALT
11671                ) AS t
11672                WHERE {table_variants}."#CHROM" = t."#CHROM"
11673                    AND {table_variants}."POS" = t."POS"
11674                    AND {table_variants}."REF" = t."REF"
11675                    AND {table_variants}."ALT" = t."ALT"
11676            """
11677
11678            self.execute_query(query=query_update)
11679
11680        # Transcripts to info column in FORMAT
11681        if transcripts_info_format:
11682
11683            # Create column on variants table
11684            self.add_column(
11685                table_name=table_variants,
11686                column_name=transcripts_info_format,
11687                column_type="VARCHAR",
11688                default_value=None,
11689                drop=False,
11690            )
11691
11692            # Add header
11693            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
11694                transcripts_info_format,
11695                ".",
11696                "String",
11697                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11698                "unknwon",
11699                "unknwon",
11700                self.code_type_map["String"],
11701            )
11702
11703            # Add to update
11704            update_set_format.append(
11705                f""" {transcripts_info_format}=t.{transcripts_info_format} """
11706            )
11707
11708        else:
11709
11710            # Set variable for internal queries
11711            transcripts_info_format = "transcripts_info_format"
11712
11713        # Transcripts to info field in JSON
11714        if transcripts_info_field_format:
11715
11716            log.debug(f"{msg_info_prefix} - Annotation in structured format...")
11717
11718            # Add to update
11719            update_set_format.append(
11720                f""" 
11721                    INFO = concat(
11722                            CASE
11723                                WHEN INFO NOT IN ('', '.')
11724                                THEN INFO
11725                                ELSE ''
11726                            END,
11727                            CASE
11728                                WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
11729                                THEN concat(
11730                                    ';{transcripts_info_field_format}=',
11731                                    t.{transcripts_info_format}
11732                                )
11733                                ELSE ''
11734                            END
11735                            )
11736                """
11737            )
11738
11739            # Add header
11740            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
11741                transcripts_info_field_format,
11742                ".",
11743                "String",
11744                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11745                "unknwon",
11746                "unknwon",
11747                self.code_type_map["String"],
11748            )
11749
11750        if update_set_format:
11751
11752            # Update query
11753            query_update = f"""
11754                UPDATE {table_variants}
11755                    SET {", ".join(update_set_format)}
11756                FROM
11757                (
11758                    SELECT
11759                        "#CHROM", POS, REF, ALT,
11760                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
11761                    FROM 
11762                        (
11763                        SELECT
11764                            "#CHROM", POS, REF, ALT,
11765                            "{transcripts_column_id}",
11766                            concat(
11767                                "{transcripts_column_id}",
11768                                '|',
11769                                {", '|', ".join(clause_to_format)}
11770                            ) AS {transcripts_info_format}
11771                        FROM
11772                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11773                        )
11774                    GROUP BY "#CHROM", POS, REF, ALT
11775                ) AS t
11776                WHERE {table_variants}."#CHROM" = t."#CHROM"
11777                    AND {table_variants}."POS" = t."POS"
11778                    AND {table_variants}."REF" = t."REF"
11779                    AND {table_variants}."ALT" = t."ALT"
11780            """
11781
11782            self.execute_query(query=query_update)
11783
11784        return True
11785
    def rename_info_fields(
        self, fields_to_rename: dict = None, table: str = None
    ) -> dict:
        """
        The `rename_info_fields` function renames (or removes) specified fields in the VCF
        header and rewrites the corresponding `FIELD=value` entries in the INFO column of
        the variants table.

        Renames are applied as chained `regexp_replace()` calls batched into partitions of
        up to `regex_replace_partition` fields, so one SQL UPDATE handles a whole batch of
        renames at once instead of one UPDATE per field.

        :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that contains the
        mapping of fields to be renamed in a VCF (Variant Call Format) file. The keys in the dictionary
        represent the original field names that need to be renamed, and the corresponding values
        represent the new names; a value of None removes the field instead of renaming it
        :type fields_to_rename: dict
        :param table: The `table` parameter in the `rename_info_fields` function represents the name of
        the table in which the variants data is stored. Defaults to the main variants table when
        not provided
        :type table: str
        :return: The `rename_info_fields` function returns a dictionary `fields_renamed` that contains
        the original field names as keys and their corresponding new names (or None if the field was
        removed) as values. Nothing is renamed when `fields_to_rename` is None or access is read-only
        """

        # Init
        fields_renamed = {}
        config = self.get_config()
        access = config.get("access")

        # Default to the main variants table
        if table is None:
            table = self.get_table_variants()

        # regexp replace function
        # Each rename becomes one regexp_replace() wrapped around the previous
        # expression; expressions are grouped into partitions (dict key) of at
        # most `regex_replace_partition` fields to bound regexp nesting depth.
        regex_replace_dict = {}
        regex_replace_nb = 0
        regex_replace_partition = 125
        regex_replace = "concat(INFO, ';')"  # Add ';' to reduce regexp complexity

        # Skip entirely when no mapping is given or the database is read-only ("RO")
        if fields_to_rename is not None and access not in ["RO"]:

            log.info("Rename or remove fields...")

            # Header
            header = self.get_header()

            for field_to_rename, field_renamed in fields_to_rename.items():

                if field_to_rename in header.infos:

                    # Rename header: copy the header record under the new name
                    # (removal when field_renamed is None), then drop the old entry
                    if field_renamed is not None:
                        header.infos[field_renamed] = vcf.parser._Info(
                            field_renamed,
                            header.infos[field_to_rename].num,
                            header.infos[field_to_rename].type,
                            header.infos[field_to_rename].desc,
                            header.infos[field_to_rename].source,
                            header.infos[field_to_rename].version,
                            header.infos[field_to_rename].type_code,
                        )
                    del header.infos[field_to_rename]

                    # Rename INFO patterns
                    # Matches 'FIELD;' or 'FIELD=value;' at start or after ';'
                    # (the trailing ';' exists because of the concat(INFO, ';') base)
                    field_pattern = rf"(^|;)({field_to_rename})(=[^;]*)?;"
                    if field_renamed is not None:
                        # Keep separator and '=value' part, swap only the field name
                        field_renamed_pattern = rf"\1{field_renamed}\3;"
                    else:
                        # Removal: keep only the leading separator
                        field_renamed_pattern = r"\1"

                    # regexp replace
                    # Partition key advances every `regex_replace_partition` fields;
                    # at a partition boundary the chain restarts from the base
                    # expression so nesting depth stays bounded
                    regex_replace_nb += 1
                    regex_replace_key = math.floor(
                        regex_replace_nb / regex_replace_partition
                    )
                    if (regex_replace_nb % regex_replace_partition) == 0:
                        regex_replace = "concat(INFO, ';')"
                    regex_replace = f"regexp_replace({regex_replace}, '{field_pattern}', '{field_renamed_pattern}')"
                    regex_replace_dict[regex_replace_key] = regex_replace

                    # Return
                    fields_renamed[field_to_rename] = field_renamed

                    # Log
                    if field_renamed is not None:
                        log.info(
                            f"Rename or remove fields - field '{field_to_rename}' renamed to '{field_renamed}'"
                        )
                    else:
                        log.info(
                            f"Rename or remove fields - field '{field_to_rename}' removed"
                        )

                else:

                    log.warning(
                        f"Rename or remove fields - field '{field_to_rename}' not in header"
                    )

            # Rename INFO
            # One UPDATE per partition; the outer regexp_replace strips the
            # artificial trailing ';' added by the concat(INFO, ';') base
            for regex_replace_key, regex_replace in regex_replace_dict.items():
                log.info(
                    f"Rename or remove fields - Process [{regex_replace_key+1}/{len(regex_replace_dict)}]..."
                )
                query = f"""
                    UPDATE {table}
                    SET
                        INFO = regexp_replace({regex_replace}, ';$', '')
                """
                log.debug(f"query={query}")
                self.execute_query(query=query)

        return fields_renamed
11897
11898    def calculation_rename_info_fields(
11899        self,
11900        fields_to_rename: dict = None,
11901        table: str = None,
11902        operation_name: str = "RENAME_INFO_FIELDS",
11903    ) -> None:
11904        """
11905        The `calculation_rename_info_fields` function retrieves parameters from a dictionary, updates
11906        fields to rename and table if provided, and then calls another function to rename the fields.
11907
11908        :param fields_to_rename: `fields_to_rename` is a dictionary that contains the fields to be
11909        renamed in a table. Each key-value pair in the dictionary represents the original field name as
11910        the key and the new field name as the value
11911        :type fields_to_rename: dict
11912        :param table: The `table` parameter in the `calculation_rename_info_fields` method is used to
11913        specify the name of the table for which the fields are to be renamed. It is a string type
11914        parameter
11915        :type table: str
11916        :param operation_name: The `operation_name` parameter in the `calculation_rename_info_fields`
11917        method is a string that specifies the name of the operation being performed. In this context, it
11918        is used as a default value for the operation name if not explicitly provided when calling the
11919        function, defaults to RENAME_INFO_FIELDS
11920        :type operation_name: str (optional)
11921        """
11922
11923        # Param
11924        param = self.get_param()
11925
11926        # Get param fields to rename
11927        param_fields_to_rename = (
11928            param.get("calculation", {})
11929            .get("calculations", {})
11930            .get(operation_name, {})
11931            .get("fields_to_rename", None)
11932        )
11933
11934        # Get param table
11935        param_table = (
11936            param.get("calculation", {})
11937            .get("calculations", {})
11938            .get(operation_name, {})
11939            .get("table", None)
11940        )
11941
11942        # Init fields_to_rename
11943        if fields_to_rename is None:
11944            fields_to_rename = param_fields_to_rename
11945
11946        # Init table
11947        if table is None:
11948            table = param_table
11949
11950        renamed_fields = self.rename_info_fields(
11951            fields_to_rename=fields_to_rename, table=table
11952        )
11953
11954        log.debug(f"renamed_fields:{renamed_fields}")
11955
11956    def create_annotations_view(
11957        self,
11958        table: str = None,
11959        view: str = None,
11960        view_type: str = None,
11961        fields: list = None,
11962        prefix: str = "",
11963        drop_view: bool = False,
11964        fields_to_rename: dict = None,
11965        limit: int = None,
11966    ) -> str:
11967        """
11968        The `create_annotations_view` function creates a SQL view from fields in a VCF INFO column.
11969
11970        :param table: The `table` parameter in the `create_annotations_view` function is used to specify
11971        the name of the table from which the fields are to be extracted. This table contains the
11972        variants data, and the function creates a view based on the fields in the INFO column of this
11973        table
11974        :type table: str
11975        :param view: The `view` parameter in the `create_annotations_view` function is used to specify
11976        the name of the view that will be created based on the fields in the VCF INFO column. This view
11977        will contain the extracted fields from the INFO column in a structured format for further
11978        processing or analysis
11979        :type view: str
11980        :param view_type: The `view_type` parameter in the `create_annotations_view` function is used to
11981        specify the type of view that will be created. It can be either a `VIEW` or a `TABLE`, and the
11982        function will create the view based on the specified type
11983        :type view_type: str
11984        :param fields: The `fields` parameter in the `create_annotations_view` function is a list that
11985        contains the names of the fields to be extracted from the INFO column in the VCF file. These
11986        fields will be used to create the view with the specified columns and data extracted from the
11987        INFO column
11988        :type fields: list
11989        :param prefix: The `prefix` parameter in the `create_annotations_view` function is used to
11990        specify a prefix that will be added to the field names in the view. This prefix helps in
11991        distinguishing the fields extracted from the INFO column in the view
11992        :type prefix: str
11993        :param drop_view: The `drop_view` parameter in the `create_annotations_view` function is a boolean
11994        flag that determines whether to drop the existing view with the same name before creating a new
11995        view. If set to `True`, the function will drop the existing view before creating a new view with
11996        the specified name
11997        :type drop_view: bool
11998        :param fields_to_rename: The `fields_to_rename` parameter in the `create_annotations_view`
11999        function is a dictionary that contains the mapping of fields to be renamed in the VCF file. The
12000        keys in the dictionary represent the original field names that need to be renamed, and the
12001        corresponding values represent the new names to which the fields should be
12002        :type fields_to_rename: dict
12003        :param limit: The `limit` parameter in the `create_annotations_view` function is an integer that
12004        specifies the maximum number of rows to be included in the view. If provided, the function will
12005        limit the number of rows in the view to the specified value
12006        :type limit: int
12007        :return: The `create_annotations_view` function returns the name of the view that is created
12008        based on the fields extracted from the INFO column in the VCF file. This view contains the
12009        extracted fields in a structured format for further processing or analysis
12010        """
12011
12012        # Create a sql view from fields in VCF INFO column, with each column is a field present in the VCF header (with a specific type from VCF header) and extracted from INFO column (with a regexp like in rename_info_fields), and each row is a variant.
12013
12014        # Get table
12015        if table is None:
12016            table = self.get_table_variants()
12017
12018        # Get view
12019        if view is None:
12020            view = f"{table}_annotations"
12021
12022        # Get view type
12023        if view_type is None:
12024            view_type = "VIEW"
12025
12026        # Check view type value
12027        if view_type.upper() not in ["VIEW", "TABLE"]:
12028            raise ValueError(
12029                f"Invalid view type value: {view_type}. Either 'VIEW' or 'TABLE'"
12030            )
12031
12032        # Get header
12033        header = self.get_header()
12034
12035        # Get fields
12036        if fields is None:
12037            fields = list(header.infos.keys())
12038
12039        # Get fields to rename
12040        if fields_to_rename is None:
12041            fields_to_rename = {}
12042
12043        log.info(
12044            f"Create '{view}' view (as '{view_type}') from table '{table}' with {len(fields)} fields"
12045        )
12046
12047        # Describe table
12048        table_describe_query = f"""
12049            DESCRIBE {table}
12050        """
12051        table_describe = self.get_query_to_df(query=table_describe_query)
12052
12053        # Create fields for annotation view extracted from INFO column in table variants (with regexp_replace like in rename_info_fields), with column type from VCF header
12054        fields_columns = []
12055        fields_needed = ["#CHROM", "POS", "REF", "ALT"]
12056        field_sql_type_list = False
12057        for field in fields:
12058
12059            # Rename field
12060            field_to_rename = fields_to_rename.get(field, field)
12061
12062            # Check field type
12063
12064            # Needed fields
12065            if field in fields_needed:
12066                continue
12067
12068            # Fields in table
12069            elif field in list(table_describe.get("column_name")):
12070                fields_columns.append(f""" "{field}" AS '{prefix}{field_to_rename}' """)
12071
12072            # Fields in header
12073            elif field in header.infos:
12074
12075                # Field info
12076                field_infos = header.infos.get(field, None)
12077
12078                # Field SQL type
12079                field_sql_type = code_type_map_to_sql.get(field_infos.type, "VARCHAR")
12080
12081                # Column is a list
12082                if field_infos.num != 1:
12083                    field_sql_type_list = True
12084
12085                # Colonne is a flag
12086                if field_infos.type == "Flag":
12087                    field_pattern = rf"(^|;)({field})([^;]*)?"
12088                    fields_columns.append(
12089                        f""" regexp_matches("INFO", '{field_pattern}')::BOOLEAN AS '{prefix}{field_to_rename}' """
12090                    )
12091
12092                # Colonne with a type
12093                else:
12094
12095                    # Field pattern
12096                    field_pattern = rf"(^|;)({field})=([^;]*)?"
12097
12098                    # Field is a list
12099                    if field_sql_type_list:
12100                        fields_columns.append(
12101                            f""" CAST(list_transform(string_split(NULLIF(regexp_extract("INFO", '{field_pattern}', 3), ''), ','), x -> CASE WHEN x = '.' OR x = '' THEN NULL ELSE x END) AS {field_sql_type}[]) AS '{prefix}{field_to_rename}' """
12102                        )
12103
12104                    # Field is a unique value
12105                    else:
12106                        fields_columns.append(
12107                            f""" NULLIF(regexp_replace(regexp_extract("INFO", '{field_pattern}', 3), '^\\.$', ''), '')::{field_sql_type} AS '{prefix}{field_to_rename}' """
12108                        )
12109
12110            else:
12111                fields_columns.append(f""" null AS '{prefix}{field_to_rename}' """)
12112                msg_err = f"Field '{field}' is not found (in table or header): '{field}' will be set to NULL"
12113                log.warning(msg=msg_err)
12114
12115        # Limit
12116        limit_clause = ""
12117        if limit is not None:
12118            limit_clause = f" LIMIT {limit} "
12119
12120        # Query select
12121        query_select = f"""
12122            SELECT
12123                {', '.join([f'"{field}"' for field in fields_needed])}, {", ".join(fields_columns)}
12124            FROM
12125                {table}
12126            {limit_clause}
12127        """
12128
12129        # Drop if any
12130        if drop_view:
12131            log.debug(f"Drop view: {view}")
12132            query_create_view = f"""
12133                DROP {view_type} IF EXISTS {view}
12134            """
12135            self.execute_query(query=query_create_view)
12136            log.debug(f"View dropped: {view}")
12137
12138        # Create view
12139        log.debug(f"Create view: {view}")
12140        query_create_view = f"""
12141            CREATE {view_type} IF NOT EXISTS {view} AS {query_select}
12142        """
12143        # log.debug(f"query_create_view:{query_create_view}")
12144        self.execute_query(query=query_create_view)
12145        log.debug(f"View created: {view}")
12146
12147        return view
Variants( conn=None, input: str = None, output: str = None, config: dict = {}, param: dict = {}, load: bool = False)
39    def __init__(
40        self,
41        conn=None,
42        input: str = None,
43        output: str = None,
44        config: dict = {},
45        param: dict = {},
46        load: bool = False,
47    ) -> None:
48        """
49        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
50        header
51
52        :param conn: the connection to the database
53        :param input: the input file
54        :param output: the output file
55        :param config: a dictionary containing the configuration of the model
56        :param param: a dictionary containing the parameters of the model
57        """
58
59        # Init variables
60        self.init_variables()
61
62        # Input
63        self.set_input(input)
64
65        # Config
66        self.set_config(config)
67
68        # Param
69        self.set_param(param)
70
71        # Output
72        self.set_output(output)
73
74        # connexion
75        self.set_connexion(conn)
76
77        # Header
78        self.set_header()
79
80        # Samples
81        self.set_samples()
82
83        # Load data
84        if load:
85            self.load_data()

The function __init__ initializes the variables, sets the input, output, config, param, connexion and header

Parameters
  • conn: the connection to the database
  • input: the input file
  • output: the output file
  • config: a dictionary containing the configuration of the model
  • param: a dictionary containing the parameters of the model
def set_samples(self, samples: list = None) -> list:
 87    def set_samples(self, samples: list = None) -> list:
 88        """
 89        The function `set_samples` sets the samples attribute of an object to a provided list or
 90        retrieves it from a parameter dictionary.
 91
 92        :param samples: The `set_samples` method is a method of a class that takes a list of samples as
 93        input and sets the `samples` attribute of the class to the provided list. If no samples are
 94        provided, it tries to get the samples from the class's parameters using the `get_param` method
 95        :type samples: list
 96        :return: The `samples` list is being returned.
 97        """
 98
 99        if not samples:
100            samples = self.get_param().get("samples", {}).get("list", None)
101
102        self.samples = samples
103
104        return samples

The function set_samples sets the samples attribute of an object to a provided list or retrieves it from a parameter dictionary.

Parameters
  • samples: The set_samples method is a method of a class that takes a list of samples as input and sets the samples attribute of the class to the provided list. If no samples are provided, it tries to get the samples from the class's parameters using the get_param method
Returns

The samples list is being returned.

def get_samples(self) -> list:
106    def get_samples(self) -> list:
107        """
108        This function returns a list of samples.
109        :return: The `get_samples` method is returning the `samples` attribute of the object.
110        """
111
112        return self.samples

This function returns a list of samples.

Returns

The get_samples method is returning the samples attribute of the object.

def get_samples_check(self) -> bool:
114    def get_samples_check(self) -> bool:
115        """
116        This function returns the value of the "check" key within the "samples" dictionary retrieved
117        from the parameters.
118        :return: The method `get_samples_check` is returning the value of the key "check" inside the
119        "samples" dictionary, which is nested inside the dictionary returned by the `get_param()`
120        method. If the key "check" is not found, it will return `False`.
121        """
122
123        return self.get_param().get("samples", {}).get("check", True)

This function returns the value of the "check" key within the "samples" dictionary retrieved from the parameters.

Returns

The method get_samples_check is returning the value of the key "check" inside the "samples" dictionary, which is nested inside the dictionary returned by the get_param() method. If the key "check" is not found, it will return False.

def set_input(self, input: str = None) -> None:
125    def set_input(self, input: str = None) -> None:
126        """
127        The function `set_input` takes a file name as input, extracts the name and extension, and sets
128        attributes in the class accordingly.
129
130        :param input: The `set_input` method in the provided code snippet is used to set attributes
131        related to the input file. Here's a breakdown of the parameters and their usage in the method:
132        :type input: str
133        """
134
135        if input and not isinstance(input, str):
136            try:
137                self.input = input.name
138            except:
139                log.error(f"Input file '{input} in bad format")
140                raise ValueError(f"Input file '{input} in bad format")
141        else:
142            self.input = input
143
144        # Input format
145        if input:
146            input_name, input_extension = os.path.splitext(self.input)
147            self.input_name = input_name
148            self.input_extension = input_extension
149            self.input_format = self.input_extension.replace(".", "")

The function set_input takes a file name as input, extracts the name and extension, and sets attributes in the class accordingly.

Parameters
  • input: the input file, given either as a path string or as a file-like object with a name attribute; its name and extension are extracted and stored as attributes of the class
def set_config(self, config: dict) -> None:
151    def set_config(self, config: dict) -> None:
152        """
153        The set_config function takes a config object and assigns it as the configuration object for the
154        class.
155
156        :param config: The `config` parameter in the `set_config` function is a dictionary object that
157        contains configuration settings for the class. When you call the `set_config` function with a
158        dictionary object as the argument, it will set that dictionary as the configuration object for
159        the class
160        :type config: dict
161        """
162
163        self.config = config

The set_config function takes a config object and assigns it as the configuration object for the class.

Parameters
  • config: The config parameter in the set_config function is a dictionary object that contains configuration settings for the class. When you call the set_config function with a dictionary object as the argument, it will set that dictionary as the configuration object for the class
def set_param(self, param: dict) -> None:
165    def set_param(self, param: dict) -> None:
166        """
167        This function sets a parameter object for the class based on the input dictionary.
168
169        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
170        as the `param` attribute of the class instance
171        :type param: dict
172        """
173
174        self.param = param

This function sets a parameter object for the class based on the input dictionary.

Parameters
  • param: The set_param method you provided takes a dictionary object as input and sets it as the param attribute of the class instance
def init_variables(self) -> None:
176    def init_variables(self) -> None:
177        """
178        This function initializes the variables that will be used in the rest of the class
179        """
180
181        self.prefix = "howard"
182        self.table_variants = "variants"
183        self.dataframe = None
184
185        self.comparison_map = {
186            "gt": ">",
187            "gte": ">=",
188            "lt": "<",
189            "lte": "<=",
190            "equals": "=",
191            "contains": "SIMILAR TO",
192        }
193
194        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
195
196        self.code_type_map_to_sql = {
197            "Integer": "INTEGER",
198            "String": "VARCHAR",
199            "Float": "FLOAT",
200            "Flag": "VARCHAR",
201        }
202
203        self.index_additionnal_fields = []

This function initializes the variables that will be used in the rest of the class

def get_indexing(self) -> bool:
205    def get_indexing(self) -> bool:
206        """
207        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
208        returns False.
209        :return: The value of the indexing parameter.
210        """
211
212        return self.get_param().get("indexing", False)

It returns the value of the key "indexing" in the dictionary. If the key is not present, it returns False.

Returns

The value of the indexing parameter.

def get_connexion_config(self) -> dict:
214    def get_connexion_config(self) -> dict:
215        """
216        The function `get_connexion_config` returns a dictionary containing the configuration for a
217        connection, including the number of threads and memory limit.
218        :return: a dictionary containing the configuration for the Connexion library.
219        """
220
221        # config
222        config = self.get_config()
223
224        # Connexion config
225        connexion_config = {}
226        threads = self.get_threads()
227
228        # Threads
229        if threads:
230            connexion_config["threads"] = threads
231
232        # Memory
233        # if config.get("memory", None):
234        #     connexion_config["memory_limit"] = config.get("memory")
235        if self.get_memory():
236            connexion_config["memory_limit"] = self.get_memory()
237
238        # Temporary directory
239        if config.get("tmp", None):
240            connexion_config["temp_directory"] = config.get("tmp")
241
242        # Access
243        if config.get("access", None):
244            access = config.get("access")
245            if access in ["RO"]:
246                access = "READ_ONLY"
247            elif access in ["RW"]:
248                access = "READ_WRITE"
249            connexion_db = self.get_connexion_db()
250            if connexion_db in ":memory:":
251                access = "READ_WRITE"
252            connexion_config["access_mode"] = access
253
254        return connexion_config

The function get_connexion_config returns a dictionary containing the configuration for a connection, including the number of threads and memory limit.

Returns

a dictionary containing the database connection configuration (threads, memory limit, temporary directory, access mode).

def get_duckdb_settings(self) -> dict:
256    def get_duckdb_settings(self) -> dict:
257        """
258        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
259        string.
260        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
261        """
262
263        # config
264        config = self.get_config()
265
266        # duckdb settings
267        duckdb_settings_dict = {}
268        if config.get("duckdb_settings", None):
269            duckdb_settings = config.get("duckdb_settings")
270            duckdb_settings = full_path(duckdb_settings)
271            # duckdb setting is a file
272            if os.path.exists(duckdb_settings):
273                with open(duckdb_settings) as json_file:
274                    duckdb_settings_dict = yaml.safe_load(json_file)
275            # duckdb settings is a string
276            else:
277                duckdb_settings_dict = json.loads(duckdb_settings)
278
279        return duckdb_settings_dict

The function get_duckdb_settings retrieves DuckDB settings from a configuration file or a string.

Returns

The function get_duckdb_settings returns a dictionary object duckdb_settings_dict.

def set_connexion_db(self) -> str:
281    def set_connexion_db(self) -> str:
282        """
283        The function `set_connexion_db` returns the appropriate database connection string based on the
284        input format and connection type.
285        :return: the value of the variable `connexion_db`.
286        """
287
288        # Default connexion db
289        default_connexion_db = ":memory:"
290
291        # Find connexion db
292        if self.get_input_format() in ["db", "duckdb"]:
293            connexion_db = self.get_input()
294        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
295            connexion_db = default_connexion_db
296        elif self.get_connexion_type() in ["tmpfile"]:
297            tmp_name = tempfile.mkdtemp(
298                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
299            )
300            connexion_db = f"{tmp_name}/tmp.db"
301        elif self.get_connexion_type() != "":
302            connexion_db = self.get_connexion_type()
303        else:
304            connexion_db = default_connexion_db
305
306        # Set connexion db
307        self.connexion_db = connexion_db
308
309        return connexion_db

The function set_connexion_db returns the appropriate database connection string based on the input format and connection type.

Returns

the value of the variable connexion_db.

def set_connexion(self, conn) -> None:
311    def set_connexion(self, conn) -> None:
312        """
313        The function `set_connexion` creates a connection to a database, with options for different
314        database formats and settings.
315
316        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
317        database. If a connection is not provided, a new connection to an in-memory database is created.
318        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
319        sqlite
320        """
321
322        # Connexion db
323        connexion_db = self.set_connexion_db()
324
325        # Connexion config
326        connexion_config = self.get_connexion_config()
327
328        # Connexion format
329        connexion_format = self.get_config().get("connexion_format", "duckdb")
330        # Set connexion format
331        self.connexion_format = connexion_format
332
333        # Connexion
334        if not conn:
335            if connexion_format in ["duckdb"]:
336                conn = duckdb.connect(connexion_db, config=connexion_config)
337                # duckDB settings
338                duckdb_settings = self.get_duckdb_settings()
339                if duckdb_settings:
340                    for setting in duckdb_settings:
341                        setting_value = duckdb_settings.get(setting)
342                        if isinstance(setting_value, str):
343                            setting_value = f"'{setting_value}'"
344                        conn.execute(f"PRAGMA {setting}={setting_value};")
345            elif connexion_format in ["sqlite"]:
346                conn = sqlite3.connect(connexion_db)
347
348        # Set connexion
349        self.conn = conn
350
351        # Log
352        log.debug(f"connexion_format: {connexion_format}")
353        log.debug(f"connexion_db: {connexion_db}")
354        log.debug(f"connexion config: {connexion_config}")
355        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")

The function set_connexion creates a connection to a database, with options for different database formats and settings.

Parameters
  • conn: The conn parameter in the set_connexion method is the connection to the database. If a connection is not provided, a new connection to an in-memory database is created. The method then proceeds to set up the connection based on the specified format (e.g., duckdb or sqlite
def set_output(self, output: str = None) -> None:
357    def set_output(self, output: str = None) -> None:
358        """
359        The `set_output` function in Python sets the output file based on the input or a specified key
360        in the config file, extracting the output name, extension, and format.
361
362        :param output: The `output` parameter in the `set_output` method is used to specify the name of
363        the output file. If the config file has an 'output' key, the method sets the output to the value
364        of that key. If no output is provided, it sets the output to `None`
365        :type output: str
366        """
367
368        if output and not isinstance(output, str):
369            self.output = output.name
370        else:
371            self.output = output
372
373        # Output format
374        if self.output:
375            output_name, output_extension = os.path.splitext(self.output)
376            self.output_name = output_name
377            self.output_extension = output_extension
378            self.output_format = self.output_extension.replace(".", "")
379        else:
380            self.output_name = None
381            self.output_extension = None
382            self.output_format = None

The set_output function in Python sets the output file based on the input or a specified key in the config file, extracting the output name, extension, and format.

Parameters
  • output: The output parameter in the set_output method is used to specify the name of the output file. If the config file has an 'output' key, the method sets the output to the value of that key. If no output is provided, it sets the output to None
def set_header(self) -> None:
384    def set_header(self) -> None:
385        """
386        It reads the header of a VCF file and stores it as a list of strings and as a VCF object
387        """
388
389        input_file = self.get_input()
390        default_header_list = [
391            "##fileformat=VCFv4.2",
392            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
393        ]
394
395        # Full path
396        input_file = full_path(input_file)
397
398        if input_file:
399
400            input_format = self.get_input_format()
401            input_compressed = self.get_input_compressed()
402            config = self.get_config()
403            header_list = default_header_list
404            if input_format in [
405                "vcf",
406                "hdr",
407                "tsv",
408                "csv",
409                "psv",
410                "parquet",
411                "db",
412                "duckdb",
413            ]:
414                # header provided in param
415                if config.get("header_file", None):
416                    with open(config.get("header_file"), "rt") as f:
417                        header_list = self.read_vcf_header(f)
418                # within a vcf file format (header within input file itsself)
419                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
420                    # within a compressed vcf file format (.vcf.gz)
421                    if input_compressed:
422                        with bgzf.open(input_file, "rt") as f:
423                            header_list = self.read_vcf_header(f)
424                    # within an uncompressed vcf file format (.vcf)
425                    else:
426                        with open(input_file, "rt") as f:
427                            header_list = self.read_vcf_header(f)
428                # header provided in default external file .hdr
429                elif os.path.exists((input_file + ".hdr")):
430                    with open(input_file + ".hdr", "rt") as f:
431                        header_list = self.read_vcf_header(f)
432                else:
433                    try:  # Try to get header info fields and file columns
434
435                        with tempfile.TemporaryDirectory() as tmpdir:
436
437                            # Create database
438                            db_for_header = Database(database=input_file)
439
440                            # Get header columns for infos fields
441                            db_header_from_columns = (
442                                db_for_header.get_header_from_columns()
443                            )
444
445                            # Get real columns in the file
446                            db_header_columns = db_for_header.get_columns()
447
448                            # Write header file
449                            header_file_tmp = os.path.join(tmpdir, "header")
450                            f = open(header_file_tmp, "w")
451                            vcf.Writer(f, db_header_from_columns)
452                            f.close()
453
454                            # Replace #CHROM line with rel columns
455                            header_list = db_for_header.read_header_file(
456                                header_file=header_file_tmp
457                            )
458                            header_list[-1] = "\t".join(db_header_columns)
459
460                    except:
461
462                        log.warning(
463                            f"No header for file {input_file}. Set as default VCF header"
464                        )
465                        header_list = default_header_list
466
467            else:  # try for unknown format ?
468
469                log.error(f"Input file format '{input_format}' not available")
470                raise ValueError(f"Input file format '{input_format}' not available")
471
472            if not header_list:
473                header_list = default_header_list
474
475            # header as list
476            self.header_list = header_list
477
478            # header as VCF object
479            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))
480
481        else:
482
483            self.header_list = None
484            self.header_vcf = None

It reads the header of a VCF file and stores it as a list of strings and as a VCF object

def get_query_to_df(self, query: str = '', limit: int = None) -> pandas.core.frame.DataFrame:
486    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
487        """
488        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
489        DataFrame based on the connection format.
490
491        :param query: The `query` parameter in the `get_query_to_df` function is a string that
492        represents the SQL query you want to execute. This query will be used to fetch data from a
493        database and convert it into a pandas DataFrame
494        :type query: str
495        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
496        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
497        function will only fetch up to that number of rows from the database query result. If no limit
498        is specified,
499        :type limit: int
500        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
501        """
502
503        # Connexion format
504        connexion_format = self.get_connexion_format()
505
506        # Limit in query
507        if limit:
508            pd.set_option("display.max_rows", limit)
509            if connexion_format in ["duckdb"]:
510                df = (
511                    self.conn.execute(query)
512                    .fetch_record_batch(limit)
513                    .read_next_batch()
514                    .to_pandas()
515                )
516            elif connexion_format in ["sqlite"]:
517                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
518
519        # Full query
520        else:
521            if connexion_format in ["duckdb"]:
522                df = self.conn.execute(query).df()
523            elif connexion_format in ["sqlite"]:
524                df = pd.read_sql_query(query, self.conn)
525
526        return df

The get_query_to_df function takes a query as a string and returns the result as a pandas DataFrame based on the connection format.

Parameters
  • query: The query parameter in the get_query_to_df function is a string that represents the SQL query you want to execute. This query will be used to fetch data from a database and convert it into a pandas DataFrame
  • limit: The limit parameter in the get_query_to_df function is used to specify the maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the function will only fetch up to that number of rows from the database query result. If no limit is specified,
Returns

A pandas DataFrame is being returned by the get_query_to_df function.

def get_overview(self) -> None:
528    def get_overview(self) -> None:
529        """
530        The function prints the input, output, config, and dataframe of the current object
531        """
532        table_variants_from = self.get_table_variants(clause="from")
533        sql_columns = self.get_header_columns_as_sql()
534        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
535        df = self.get_query_to_df(sql_query_export)
536        log.info(
537            "Input:  "
538            + str(self.get_input())
539            + " ["
540            + str(str(self.get_input_format()))
541            + "]"
542        )
543        log.info(
544            "Output: "
545            + str(self.get_output())
546            + " ["
547            + str(str(self.get_output_format()))
548            + "]"
549        )
550        log.info("Config: ")
551        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
552            "\n"
553        ):
554            log.info("\t" + str(d))
555        log.info("Param: ")
556        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
557            "\n"
558        ):
559            log.info("\t" + str(d))
560        log.info("Sample list: " + str(self.get_header_sample_list()))
561        log.info("Dataframe: ")
562        for d in str(df).split("\n"):
563            log.info("\t" + str(d))
564
565        # garbage collector
566        del df
567        gc.collect()
568
569        return None

The function prints the input, output, config, and dataframe of the current object

def get_stats(self) -> dict:
571    def get_stats(self) -> dict:
572        """
573        The `get_stats` function calculates and returns various statistics of the current object,
574        including information about the input file, variants, samples, header fields, quality, and
575        SNVs/InDels.
576        :return: a dictionary containing various statistics of the current object. The dictionary has
577        the following structure:
578        """
579
580        # Log
581        log.info(f"Stats Calculation...")
582
583        # table varaints
584        table_variants_from = self.get_table_variants()
585
586        # stats dict
587        stats = {"Infos": {}}
588
589        ### File
590        input_file = self.get_input()
591        stats["Infos"]["Input file"] = input_file
592
593        # Header
594        header_infos = self.get_header().infos
595        header_formats = self.get_header().formats
596        header_infos_list = list(header_infos)
597        header_formats_list = list(header_formats)
598
599        ### Variants
600
601        stats["Variants"] = {}
602
603        # Variants by chr
604        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
605        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
606        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
607            by=["CHROM"], kind="quicksort"
608        )
609
610        # Total number of variants
611        nb_of_variants = nb_of_variants_by_chrom["count"].sum()
612
613        # Calculate percentage
614        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
615            lambda x: (x / nb_of_variants)
616        )
617
618        stats["Variants"]["Number of variants by chromosome"] = (
619            nb_of_variants_by_chrom.to_dict(orient="index")
620        )
621
622        stats["Infos"]["Number of variants"] = int(nb_of_variants)
623
624        ### Samples
625
626        # Init
627        samples = {}
628        nb_of_samples = 0
629
630        # Check Samples
631        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
632            log.debug(f"Check samples...")
633            for sample in self.get_header_sample_list():
634                sql_query_samples = f"""
635                    SELECT  '{sample}' as sample,
636                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
637                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
638                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
639                    FROM {table_variants_from}
640                    WHERE (
641                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
642                        AND
643                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
644                      )
645                    GROUP BY genotype
646                    """
647                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
648                sample_genotype_count = sql_query_genotype_df["count"].sum()
649                if len(sql_query_genotype_df):
650                    nb_of_samples += 1
651                    samples[f"{sample} - {sample_genotype_count} variants"] = (
652                        sql_query_genotype_df.to_dict(orient="index")
653                    )
654
655            stats["Samples"] = samples
656            stats["Infos"]["Number of samples"] = nb_of_samples
657
658        # #
659        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
660        #     stats["Infos"]["Number of samples"] = nb_of_samples
661        # elif nb_of_samples:
662        #     stats["Infos"]["Number of samples"] = "not a VCF format"
663
664        ### INFO and FORMAT fields
665        header_types_df = {}
666        header_types_list = {
667            "List of INFO fields": header_infos,
668            "List of FORMAT fields": header_formats,
669        }
670        i = 0
671        for header_type in header_types_list:
672
673            header_type_infos = header_types_list.get(header_type)
674            header_infos_dict = {}
675
676            for info in header_type_infos:
677
678                i += 1
679                header_infos_dict[i] = {}
680
681                # ID
682                header_infos_dict[i]["id"] = info
683
684                # num
685                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
686                if header_type_infos[info].num in genotype_map.keys():
687                    header_infos_dict[i]["Number"] = genotype_map.get(
688                        header_type_infos[info].num
689                    )
690                else:
691                    header_infos_dict[i]["Number"] = header_type_infos[info].num
692
693                # type
694                if header_type_infos[info].type:
695                    header_infos_dict[i]["Type"] = header_type_infos[info].type
696                else:
697                    header_infos_dict[i]["Type"] = "."
698
699                # desc
700                if header_type_infos[info].desc != None:
701                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
702                else:
703                    header_infos_dict[i]["Description"] = ""
704
705            if len(header_infos_dict):
706                header_types_df[header_type] = pd.DataFrame.from_dict(
707                    header_infos_dict, orient="index"
708                ).to_dict(orient="index")
709
710        # Stats
711        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
712        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
713        stats["Header"] = header_types_df
714
715        ### QUAL
716        if "QUAL" in self.get_header_columns():
717            sql_query_qual = f"""
718                    SELECT
719                        avg(CAST(QUAL AS INTEGER)) AS Average,
720                        min(CAST(QUAL AS INTEGER)) AS Minimum,
721                        max(CAST(QUAL AS INTEGER)) AS Maximum,
722                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
723                        median(CAST(QUAL AS INTEGER)) AS Median,
724                        variance(CAST(QUAL AS INTEGER)) AS Variance
725                    FROM {table_variants_from}
726                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
727                    """
728
729            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
730            stats["Quality"] = {"Stats": qual}
731
732        ### SNV and InDel
733
734        sql_query_snv = f"""
735            
736            SELECT Type, count FROM (
737
738                    SELECT
739                        'Total' AS Type,
740                        count(*) AS count
741                    FROM {table_variants_from}
742
743                    UNION
744
745                    SELECT
746                        'MNV' AS Type,
747                        count(*) AS count
748                    FROM {table_variants_from}
749                    WHERE len(REF) > 1 AND len(ALT) > 1
750                    AND len(REF) = len(ALT)
751
752                    UNION
753
754                    SELECT
755                        'InDel' AS Type,
756                        count(*) AS count
757                    FROM {table_variants_from}
758                    WHERE len(REF) > 1 OR len(ALT) > 1
759                    AND len(REF) != len(ALT)
760                    
761                    UNION
762
763                    SELECT
764                        'SNV' AS Type,
765                        count(*) AS count
766                    FROM {table_variants_from}
767                    WHERE len(REF) = 1 AND len(ALT) = 1
768
769                )
770
771            ORDER BY count DESC
772
773                """
774        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")
775
776        sql_query_snv_substitution = f"""
777                SELECT
778                    concat(REF, '>', ALT) AS 'Substitution',
779                    count(*) AS count
780                FROM {table_variants_from}
781                WHERE len(REF) = 1 AND len(ALT) = 1
782                GROUP BY REF, ALT
783                ORDER BY count(*) DESC
784                """
785        snv_substitution = (
786            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
787        )
788        stats["Variants"]["Counts"] = snv_indel
789        stats["Variants"]["Substitutions"] = snv_substitution
790
791        return stats

The get_stats function calculates and returns various statistics of the current object, including information about the input file, variants, samples, header fields, quality, and SNVs/InDels.

Returns

a dictionary containing various statistics of the current object. The dictionary has the following structure:

def stats_to_file(self, file: str = None) -> str:
793    def stats_to_file(self, file: str = None) -> str:
794        """
795        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
796        into a JSON object, and writes the JSON object to the specified file.
797
798        :param file: The `file` parameter is a string that represents the file path where the JSON data
799        will be written
800        :type file: str
801        :return: the name of the file that was written to.
802        """
803
804        # Get stats
805        stats = self.get_stats()
806
807        # Serializing json
808        json_object = json.dumps(stats, indent=4)
809
810        # Writing to sample.json
811        with open(file, "w") as outfile:
812            outfile.write(json_object)
813
814        return file

The function stats_to_file takes a file name as input, retrieves statistics, serializes them into a JSON object, and writes the JSON object to the specified file.

Parameters
  • file: The file parameter is a string that represents the file path where the JSON data will be written
Returns

the name of the file that was written to.

def print_stats(self, output_file: str = None, json_file: str = None) -> None:
816    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
817        """
818        The `print_stats` function generates a markdown file and prints the statistics contained in a
819        JSON file in a formatted manner.
820
821        :param output_file: The `output_file` parameter is a string that specifies the path and filename
822        of the output file where the stats will be printed in Markdown format. If no `output_file` is
823        provided, a temporary directory will be created and the stats will be saved in a file named
824        "stats.md" within that
825        :type output_file: str
826        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
827        file where the statistics will be saved. If no value is provided, a temporary directory will be
828        created and a default file name "stats.json" will be used
829        :type json_file: str
830        :return: The function `print_stats` does not return any value. It has a return type annotation
831        of `None`.
832        """
833
834        # Full path
835        output_file = full_path(output_file)
836        json_file = full_path(json_file)
837
838        with tempfile.TemporaryDirectory() as tmpdir:
839
840            # Files
841            if not output_file:
842                output_file = os.path.join(tmpdir, "stats.md")
843            if not json_file:
844                json_file = os.path.join(tmpdir, "stats.json")
845
846            # Create folders
847            if not os.path.exists(os.path.dirname(output_file)):
848                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
849            if not os.path.exists(os.path.dirname(json_file)):
850                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
851
852            # Create stats JSON file
853            stats_file = self.stats_to_file(file=json_file)
854
855            # Print stats file
856            with open(stats_file) as f:
857                stats = yaml.safe_load(f)
858
859            # Output
860            output_title = []
861            output_index = []
862            output = []
863
864            # Title
865            output_title.append("# HOWARD Stats")
866
867            # Index
868            output_index.append("## Index")
869
870            # Process sections
871            for section in stats:
872                infos = stats.get(section)
873                section_link = "#" + section.lower().replace(" ", "-")
874                output.append(f"## {section}")
875                output_index.append(f"- [{section}]({section_link})")
876
877                if len(infos):
878                    for info in infos:
879                        try:
880                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
881                            is_df = True
882                        except:
883                            try:
884                                df = pd.DataFrame.from_dict(
885                                    json.loads((infos.get(info))), orient="index"
886                                )
887                                is_df = True
888                            except:
889                                is_df = False
890                        if is_df:
891                            output.append(f"### {info}")
892                            info_link = "#" + info.lower().replace(" ", "-")
893                            output_index.append(f"   - [{info}]({info_link})")
894                            output.append(f"{df.to_markdown(index=False)}")
895                        else:
896                            output.append(f"- {info}: {infos.get(info)}")
897                else:
898                    output.append(f"NA")
899
900            # Write stats in markdown file
901            with open(output_file, "w") as fp:
902                for item in output_title:
903                    fp.write("%s\n" % item)
904                for item in output_index:
905                    fp.write("%s\n" % item)
906                for item in output:
907                    fp.write("%s\n" % item)
908
909            # Output stats in markdown
910            print("")
911            print("\n\n".join(output_title))
912            print("")
913            print("\n\n".join(output))
914            print("")
915
916        return None

The print_stats function generates a markdown file and prints the statistics contained in a JSON file in a formatted manner.

Parameters
  • output_file: The output_file parameter is a string that specifies the path and filename of the output file where the stats will be printed in Markdown format. If no output_file is provided, a temporary directory will be created and the stats will be saved in a file named "stats.md" within that
  • json_file: The json_file parameter is a string that represents the path to the JSON file where the statistics will be saved. If no value is provided, a temporary directory will be created and a default file name "stats.json" will be used
Returns

The function print_stats does not return any value. It has a return type annotation of None.

def get_input(self) -> str:
918    def get_input(self) -> str:
919        """
920        It returns the value of the input variable.
921        :return: The input is being returned.
922        """
923        return self.input

It returns the value of the input variable.

Returns

The input is being returned.

def get_input_format(self, input_file: str = None) -> str:
925    def get_input_format(self, input_file: str = None) -> str:
926        """
927        This function returns the format of the input variable, either from the provided input file or
928        by prompting for input.
929
930        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
931        represents the file path of the input file. If no `input_file` is provided when calling the
932        method, it will default to `None`
933        :type input_file: str
934        :return: The format of the input variable is being returned.
935        """
936
937        if not input_file:
938            input_file = self.get_input()
939        input_format = get_file_format(input_file)
940        return input_format

This function returns the format of the input variable, either from the provided input file or by prompting for input.

Parameters
  • input_file: The input_file parameter in the get_input_format method is a string that represents the file path of the input file. If no input_file is provided when calling the method, it will default to None
Returns

The format of the input variable is being returned.

def get_input_compressed(self, input_file: str = None) -> str:
942    def get_input_compressed(self, input_file: str = None) -> str:
943        """
944        The function `get_input_compressed` returns the format of the input variable after compressing
945        it.
946
947        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
948        that represents the file path of the input file. If no `input_file` is provided when calling the
949        method, it will default to `None` and the method will then call `self.get_input()` to
950        :type input_file: str
951        :return: The function `get_input_compressed` returns the compressed format of the input
952        variable.
953        """
954
955        if not input_file:
956            input_file = self.get_input()
957        input_compressed = get_file_compressed(input_file)
958        return input_compressed

The function get_input_compressed returns whether the input file is compressed, determined either from the provided input file path or from the instance's configured input file.

Parameters
  • input_file: The input_file parameter in the get_input_compressed method is a string that represents the file path of the input file. If no input_file is provided when calling the method, it will default to None and the method will then call self.get_input() to obtain it
Returns

The function get_input_compressed returns the compression status of the input file.

def get_output(self) -> str:
960    def get_output(self) -> str:
961        """
962        It returns the output of the neuron.
963        :return: The output of the neural network.
964        """
965
966        return self.output

It returns the value of the output variable (the output file path).

Returns

The output file path is being returned.

def get_output_format(self, output_file: str = None) -> str:
968    def get_output_format(self, output_file: str = None) -> str:
969        """
970        The function `get_output_format` returns the format of the input variable or the output file if
971        provided.
972
973        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
974        that represents the file path of the output file. If no `output_file` is provided when calling
975        the method, it will default to the output obtained from the `get_output` method of the class
976        instance. The
977        :type output_file: str
978        :return: The format of the input variable is being returned.
979        """
980
981        if not output_file:
982            output_file = self.get_output()
983        output_format = get_file_format(output_file)
984
985        return output_format

The function get_output_format returns the format of the output file, either the one provided as an argument or the instance's configured output file.

Parameters
  • output_file: The output_file parameter in the get_output_format method is a string that represents the file path of the output file. If no output_file is provided when calling the method, it will default to the output obtained from the get_output method of the class instance.
Returns

The format of the output file is being returned.

def get_config(self) -> dict:
987    def get_config(self) -> dict:
988        """
989        It returns the config
990        :return: The config variable is being returned.
991        """
992        return self.config

It returns the config

Returns

The config variable is being returned.

def get_param(self) -> dict:
994    def get_param(self) -> dict:
995        """
996        It returns the param
997        :return: The param variable is being returned.
998        """
999        return self.param

It returns the param

Returns

The param variable is being returned.

def get_connexion_db(self) -> str:
1001    def get_connexion_db(self) -> str:
1002        """
1003        It returns the connexion_db attribute of the object
1004        :return: The connexion_db is being returned.
1005        """
1006        return self.connexion_db

It returns the connexion_db attribute of the object

Returns

The connexion_db is being returned.

def get_prefix(self) -> str:
1008    def get_prefix(self) -> str:
1009        """
1010        It returns the prefix of the object.
1011        :return: The prefix is being returned.
1012        """
1013        return self.prefix

It returns the prefix of the object.

Returns

The prefix is being returned.

def get_table_variants(self, clause: str = 'select') -> str:
1015    def get_table_variants(self, clause: str = "select") -> str:
1016        """
1017        This function returns the table_variants attribute of the object
1018
1019        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
1020        defaults to select (optional)
1021        :return: The table_variants attribute of the object.
1022        """
1023
1024        # Access
1025        access = self.get_config().get("access", None)
1026
1027        # Clauses "select", "where", "update"
1028        if clause in ["select", "where", "update"]:
1029            table_variants = self.table_variants
1030        # Clause "from"
1031        elif clause in ["from"]:
1032            # For Read Only
1033            if self.get_input_format() in ["parquet"] and access in ["RO"]:
1034                input_file = self.get_input()
1035                table_variants = f"'{input_file}' as variants"
1036            # For Read Write
1037            else:
1038                table_variants = f"{self.table_variants} as variants"
1039        else:
1040            table_variants = self.table_variants
1041        return table_variants

This function returns the table_variants attribute of the object

Parameters
  • clause: the type of clause the table will be used. Either "select" or "from" (optional), defaults to select (optional)
Returns

The table_variants attribute of the object.

def get_tmp_dir(self) -> str:
1043    def get_tmp_dir(self) -> str:
1044        """
1045        The function `get_tmp_dir` returns the temporary directory path based on configuration
1046        parameters or a default path.
1047        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
1048        configuration, parameters, and a default value of "/tmp".
1049        """
1050
1051        return get_tmp(
1052            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
1053        )

The function get_tmp_dir returns the temporary directory path based on configuration parameters or a default path.

Returns

The get_tmp_dir method is returning the temporary directory path based on the configuration, parameters, and a default value of "/tmp".

def get_connexion_type(self) -> str:
1055    def get_connexion_type(self) -> str:
1056        """
1057        If the connexion type is not in the list of allowed connexion types, raise a ValueError
1058
1059        :return: The connexion type is being returned.
1060        """
1061        return self.get_config().get("connexion_type", "memory")

It returns the connexion type from the configuration, defaulting to "memory" if the key is not set.

Returns

The connexion type is being returned.

def get_connexion(self):
1063    def get_connexion(self):
1064        """
1065        It returns the connection object
1066
1067        :return: The connection object.
1068        """
1069        return self.conn

It returns the connection object

Returns

The connection object.

def close_connexion(self) -> None:
1071    def close_connexion(self) -> None:
1072        """
1073        This function closes the connection to the database.
1074        :return: The connection is being closed.
1075        """
1076        return self.conn.close()

This function closes the connection to the database.

Returns

The connection is being closed.

def get_header(self, type: str = 'vcf'):
1078    def get_header(self, type: str = "vcf"):
1079        """
1080        This function returns the header of the VCF file as a list of strings
1081
1082        :param type: the type of header you want to get, defaults to vcf (optional)
1083        :return: The header of the vcf file.
1084        """
1085
1086        if self.header_vcf:
1087            if type == "vcf":
1088                return self.header_vcf
1089            elif type == "list":
1090                return self.header_list
1091        else:
1092            if type == "vcf":
1093                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
1094                return header
1095            elif type == "list":
1096                return vcf_required

This function returns the header of the VCF file, either as a vcf.Reader object (type "vcf") or as a list of strings (type "list").

Parameters
  • type: the type of header you want to get, defaults to vcf (optional)
Returns

The header of the vcf file.

def get_header_infos_list(self) -> list:
1098    def get_header_infos_list(self) -> list:
1099        """
1100        This function retrieves a list of information fields from the header.
1101        :return: A list of information fields from the header.
1102        """
1103
1104        # Init
1105        infos_list = []
1106
1107        for field in self.get_header().infos:
1108            infos_list.append(field)
1109
1110        return infos_list

This function retrieves a list of information fields from the header.

Returns

A list of information fields from the header.

def get_header_length(self, file: str = None) -> int:
1112    def get_header_length(self, file: str = None) -> int:
1113        """
1114        The function `get_header_length` returns the length of the header list, excluding the #CHROM
1115        line.
1116
1117        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
1118        header file. If this argument is provided, the function will read the header from the specified
1119        file and return the length of the header list minus 1 (to exclude the #CHROM line)
1120        :type file: str
1121        :return: the length of the header list, excluding the #CHROM line.
1122        """
1123
1124        if file:
1125            return len(self.read_vcf_header_file(file=file)) - 1
1126        elif self.get_header(type="list"):
1127            return len(self.get_header(type="list")) - 1
1128        else:
1129            return 0

The function get_header_length returns the length of the header list, excluding the #CHROM line.

Parameters
  • file: The file parameter is an optional argument that specifies the path to a VCF header file. If this argument is provided, the function will read the header from the specified file and return the length of the header list minus 1 (to exclude the #CHROM line)
Returns

the length of the header list, excluding the #CHROM line.

def get_header_columns(self) -> str:
1131    def get_header_columns(self) -> str:
1132        """
1133        This function returns the header list of a VCF
1134
1135        :return: The length of the header list.
1136        """
1137        if self.get_header():
1138            return self.get_header(type="list")[-1]
1139        else:
1140            return ""

This function returns the last line of the VCF header (the #CHROM columns line).

Returns

The #CHROM columns line of the header, or an empty string if there is no header.

def get_header_columns_as_list(self) -> list:
1142    def get_header_columns_as_list(self) -> list:
1143        """
1144        This function returns the header list of a VCF
1145
1146        :return: The length of the header list.
1147        """
1148        if self.get_header():
1149            return self.get_header_columns().strip().split("\t")
1150        else:
1151            return []

This function returns the VCF header columns (the #CHROM line) split into a list.

Returns

The header columns as a list of column names, or an empty list if there is no header.

def get_header_columns_as_sql(self) -> str:
1153    def get_header_columns_as_sql(self) -> str:
1154        """
1155        This function retruns header length (without #CHROM line)
1156
1157        :return: The length of the header list.
1158        """
1159        sql_column_list = []
1160        for col in self.get_header_columns_as_list():
1161            sql_column_list.append(f'"{col}"')
1162        return ",".join(sql_column_list)

This function returns the header columns as a comma-separated list of double-quoted SQL identifiers.

Returns

The header columns formatted for use in an SQL statement.

def get_header_sample_list( self, check: bool = False, samples: list = None, samples_force: bool = False) -> list:
1164    def get_header_sample_list(
1165        self, check: bool = False, samples: list = None, samples_force: bool = False
1166    ) -> list:
1167        """
1168        The function `get_header_sample_list` returns a list of samples from a VCF header, with optional
1169        checking and filtering based on input parameters.
1170
1171        :param check: The `check` parameter in the `get_header_sample_list` function is a boolean
1172        parameter that determines whether to check if the samples in the list are properly defined as
1173        genotype columns. If `check` is set to `True`, the function will verify if each sample in the
1174        list is defined as a, defaults to False
1175        :type check: bool (optional)
1176        :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that
1177        allows you to specify a subset of samples from the header. If you provide a list of sample
1178        names, the function will check if each sample is defined in the header. If a sample is not found
1179        in the
1180        :type samples: list
1181        :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is
1182        a boolean parameter that determines whether to force the function to return the sample list
1183        without checking if the samples are genotype columns. If `samples_force` is set to `True`, the
1184        function will return the sample list without performing, defaults to False
1185        :type samples_force: bool (optional)
1186        :return: The function `get_header_sample_list` returns a list of samples based on the input
1187        parameters and conditions specified in the function.
1188        """
1189
1190        # Init
1191        samples_list = []
1192
1193        if samples is None:
1194            samples_list = self.header_vcf.samples
1195        else:
1196            samples_checked = []
1197            for sample in samples:
1198                if sample in self.header_vcf.samples:
1199                    samples_checked.append(sample)
1200                else:
1201                    log.warning(f"Sample '{sample}' not defined in header")
1202            samples_list = samples_checked
1203
1204            # Force sample list without checking if is_genotype_column
1205            if samples_force:
1206                log.warning(f"Samples {samples_list} not checked if genotypes")
1207                return samples_list
1208
1209        if check:
1210            samples_checked = []
1211            for sample in samples_list:
1212                if self.is_genotype_column(column=sample):
1213                    samples_checked.append(sample)
1214                else:
1215                    log.warning(
1216                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
1217                    )
1218            samples_list = samples_checked
1219
1220        # Return samples list
1221        return samples_list

The function get_header_sample_list returns a list of samples from a VCF header, with optional checking and filtering based on input parameters.

Parameters
  • check: The check parameter in the get_header_sample_list function is a boolean parameter that determines whether to check if the samples in the list are properly defined as genotype columns. If check is set to True, the function will verify if each sample in the list is defined as a, defaults to False
  • samples: The samples parameter in the get_header_sample_list function is a list that allows you to specify a subset of samples from the header. If you provide a list of sample names, the function will check if each sample is defined in the header. If a sample is not found in the
  • samples_force: The samples_force parameter in the get_header_sample_list function is a boolean parameter that determines whether to force the function to return the sample list without checking if the samples are genotype columns. If samples_force is set to True, the function will return the sample list without performing, defaults to False
Returns

The function get_header_sample_list returns a list of samples based on the input parameters and conditions specified in the function.

def is_genotype_column(self, column: str = None) -> bool:
1223    def is_genotype_column(self, column: str = None) -> bool:
1224        """
1225        This function checks if a given column is a genotype column in a database.
1226
1227        :param column: The `column` parameter in the `is_genotype_column` method is a string that
1228        represents the column name in a database table. This method checks if the specified column is a
1229        genotype column in the database. If a column name is provided, it calls the `is_genotype_column`
1230        method of
1231        :type column: str
1232        :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter
1233        is not None, it calls the `is_genotype_column` method of the `Database` class with the specified
1234        column name and returns the result. If the `column` parameter is None, it returns False.
1235        """
1236
1237        if column is not None:
1238            return Database(database=self.get_input()).is_genotype_column(column=column)
1239        else:
1240            return False

This function checks if a given column is a genotype column in a database.

Parameters
  • column: The column parameter in the is_genotype_column method is a string that represents the column name in a database table. This method checks if the specified column is a genotype column in the database. If a column name is provided, it calls the is_genotype_column method of
Returns

The is_genotype_column method is returning a boolean value. If the column parameter is not None, it calls the is_genotype_column method of the Database class with the specified column name and returns the result. If the column parameter is None, it returns False.

def get_verbose(self) -> bool:
1242    def get_verbose(self) -> bool:
1243        """
1244        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
1245        exist
1246
1247        :return: The value of the key "verbose" in the config dictionary.
1248        """
1249        return self.get_config().get("verbose", False)

It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't exist

Returns

The value of the key "verbose" in the config dictionary.

def get_connexion_format(self) -> str:
1251    def get_connexion_format(self) -> str:
1252        """
1253        It returns the connexion format of the object.
1254        :return: The connexion_format is being returned.
1255        """
1256        connexion_format = self.connexion_format
1257        if connexion_format not in ["duckdb", "sqlite"]:
1258            log.error(f"Unknown connexion format {connexion_format}")
1259            raise ValueError(f"Unknown connexion format {connexion_format}")
1260        else:
1261            return connexion_format

It returns the connexion format of the object.

Returns

The connexion_format is being returned.

def insert_file_to_table( self, file, columns: str, header_len: int = 0, sep: str = '\t', chunksize: int = 1000000) -> None:
1263    def insert_file_to_table(
1264        self,
1265        file,
1266        columns: str,
1267        header_len: int = 0,
1268        sep: str = "\t",
1269        chunksize: int = 1000000,
1270    ) -> None:
1271        """
1272        The function reads a file in chunks and inserts each chunk into a table based on the specified
1273        database format.
1274
1275        :param file: The `file` parameter is the file that you want to load into a table. It should be
1276        the path to the file on your system
1277        :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that
1278        should contain the names of the columns in the table where the data will be inserted. The column
1279        names should be separated by commas within the string. For example, if you have columns named
1280        "id", "name
1281        :type columns: str
1282        :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies
1283        the number of lines to skip at the beginning of the file before reading the actual data. This
1284        parameter allows you to skip any header information present in the file before processing the
1285        data, defaults to 0
1286        :type header_len: int (optional)
1287        :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the
1288        separator character that is used in the file being read. In this case, the default separator is
1289        set to `\t`, which represents a tab character. You can change this parameter to a different
1290        separator character if, defaults to \t
1291        :type sep: str (optional)
1292        :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time
1293        when processing the file in chunks. In the provided code snippet, the default value for
1294        `chunksize` is set to 1000000. This means that the file will be read in chunks of 1,, defaults
1295        to 1000000
1296        :type chunksize: int (optional)
1297        """
1298
1299        # Config
1300        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
1301        connexion_format = self.get_connexion_format()
1302
1303        log.debug("chunksize: " + str(chunksize))
1304
1305        if chunksize:
1306            for chunk in pd.read_csv(
1307                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
1308            ):
1309                if connexion_format in ["duckdb"]:
1310                    sql_insert_into = (
1311                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
1312                    )
1313                    self.conn.execute(sql_insert_into)
1314                elif connexion_format in ["sqlite"]:
1315                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)

The function reads a file in chunks and inserts each chunk into a table based on the specified database format.

Parameters
  • file: The file parameter is the file that you want to load into a table. It should be the path to the file on your system
  • columns: The columns parameter in the insert_file_to_table function is a string that should contain the names of the columns in the table where the data will be inserted. The column names should be separated by commas within the string. For example, if you have columns named "id", "name
  • header_len: The header_len parameter in the insert_file_to_table function specifies the number of lines to skip at the beginning of the file before reading the actual data. This parameter allows you to skip any header information present in the file before processing the data, defaults to 0
  • sep: The sep parameter in the insert_file_to_table function is used to specify the separator character that is used in the file being read. In this case, the default separator is set to `\t`, which represents a tab character. You can change this parameter to a different separator character if needed, defaults to `\t`
  • chunksize: The chunksize parameter specifies the number of rows to read in at a time when processing the file in chunks. In the provided code snippet, the default value for chunksize is set to 1000000. This means that the file will be read in chunks of 1,, defaults to 1000000
def load_data( self, input_file: str = None, drop_variants_table: bool = False, sample_size: int = 20480) -> None:
1317    def load_data(
1318        self,
1319        input_file: str = None,
1320        drop_variants_table: bool = False,
1321        sample_size: int = 20480,
1322    ) -> None:
1323        """
1324        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
1325        table before loading the data and specify a sample size.
1326
1327        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
1328        table
1329        :type input_file: str
1330        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
1331        determines whether the variants table should be dropped before loading the data. If set to
1332        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
1333        not be dropped, defaults to False
1334        :type drop_variants_table: bool (optional)
1335        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
1336        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
1337        20480
1338        :type sample_size: int (optional)
1339        """
1340
1341        log.info("Loading...")
1342
1343        # change input file
1344        if input_file:
1345            self.set_input(input_file)
1346            self.set_header()
1347
1348        # drop variants table
1349        if drop_variants_table:
1350            self.drop_variants_table()
1351
1352        # get table variants
1353        table_variants = self.get_table_variants()
1354
1355        # Access
1356        access = self.get_config().get("access", None)
1357        log.debug(f"access: {access}")
1358
1359        # Input format and compress
1360        input_format = self.get_input_format()
1361        input_compressed = self.get_input_compressed()
1362        log.debug(f"input_format: {input_format}")
1363        log.debug(f"input_compressed: {input_compressed}")
1364
1365        # input_compressed_format
1366        if input_compressed:
1367            input_compressed_format = "gzip"
1368        else:
1369            input_compressed_format = "none"
1370        log.debug(f"input_compressed_format: {input_compressed_format}")
1371
1372        # Connexion format
1373        connexion_format = self.get_connexion_format()
1374
1375        # Sample size
1376        if not sample_size:
1377            sample_size = -1
1378        log.debug(f"sample_size: {sample_size}")
1379
1380        # Load data
1381        log.debug(f"Load Data from {input_format}")
1382
1383        # DuckDB connexion
1384        if connexion_format in ["duckdb"]:
1385
1386            # Database already exists
1387            if self.input_format in ["db", "duckdb"]:
1388
1389                if connexion_format in ["duckdb"]:
1390                    log.debug(f"Input file format '{self.input_format}' duckDB")
1391                else:
1392                    log.error(
1393                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
1394                    )
1395                    raise ValueError(
1396                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
1397                    )
1398
1399            # Load from existing database format
1400            else:
1401
1402                try:
1403                    # Create Table or View
1404                    database = Database(database=self.input)
1405                    sql_from = database.get_sql_from(sample_size=sample_size)
1406
1407                    if access in ["RO"]:
1408                        sql_load = (
1409                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
1410                        )
1411                    else:
1412                        sql_load = (
1413                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
1414                        )
1415                    self.conn.execute(sql_load)
1416
1417                except:
1418                    # Format not available
1419                    log.error(f"Input file format '{self.input_format}' not available")
1420                    raise ValueError(
1421                        f"Input file format '{self.input_format}' not available"
1422                    )
1423
1424        # SQLite connexion
1425        elif connexion_format in ["sqlite"] and input_format in [
1426            "vcf",
1427            "tsv",
1428            "csv",
1429            "psv",
1430        ]:
1431
1432            # Main structure
1433            structure = {
1434                "#CHROM": "VARCHAR",
1435                "POS": "INTEGER",
1436                "ID": "VARCHAR",
1437                "REF": "VARCHAR",
1438                "ALT": "VARCHAR",
1439                "QUAL": "VARCHAR",
1440                "FILTER": "VARCHAR",
1441                "INFO": "VARCHAR",
1442            }
1443
1444            # Strcuture with samples
1445            structure_complete = structure
1446            if self.get_header_sample_list():
1447                structure["FORMAT"] = "VARCHAR"
1448                for sample in self.get_header_sample_list():
1449                    structure_complete[sample] = "VARCHAR"
1450
1451            # Columns list for create and insert
1452            sql_create_table_columns = []
1453            sql_create_table_columns_list = []
1454            for column in structure_complete:
1455                column_type = structure_complete[column]
1456                sql_create_table_columns.append(
1457                    f'"{column}" {column_type} default NULL'
1458                )
1459                sql_create_table_columns_list.append(f'"{column}"')
1460
1461            # Create database
1462            log.debug(f"Create Table {table_variants}")
1463            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
1464            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
1465            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
1466            self.conn.execute(sql_create_table)
1467
1468            # chunksize define length of file chunk load file
1469            chunksize = 100000
1470
1471            # delimiter
1472            delimiter = file_format_delimiters.get(input_format, "\t")
1473
1474            # Load the input file
1475            with open(self.input, "rt") as input_file:
1476
1477                # Use the appropriate file handler based on the input format
1478                if input_compressed:
1479                    input_file = bgzf.open(self.input, "rt")
1480                if input_format in ["vcf"]:
1481                    header_len = self.get_header_length()
1482                else:
1483                    header_len = 0
1484
1485                # Insert the file contents into a table
1486                self.insert_file_to_table(
1487                    input_file,
1488                    columns=sql_create_table_columns_list_sql,
1489                    header_len=header_len,
1490                    sep=delimiter,
1491                    chunksize=chunksize,
1492                )
1493
1494        else:
1495            log.error(
1496                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
1497            )
1498            raise ValueError(
1499                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
1500            )
1501
1502        # Explode INFOS fields into table fields
1503        if self.get_explode_infos():
1504            self.explode_infos(
1505                prefix=self.get_explode_infos_prefix(),
1506                fields=self.get_explode_infos_fields(),
1507                force=True,
1508            )
1509
1510        # Create index after insertion
1511        self.create_indexes()

The load_data function reads a VCF file and inserts it into a table, with options to drop the table before loading the data and specify a sample size.

Parameters
  • input_file: The path to the input file. This is the VCF file that will be loaded into the table
  • drop_variants_table: The drop_variants_table parameter is a boolean flag that determines whether the variants table should be dropped before loading the data. If set to True, the variants table will be dropped. If set to False (default), the variants table will not be dropped, defaults to False
  • sample_size: The sample_size parameter determines the number of rows to be sampled from the input file. If it is set to None, the default value of 20480 will be used, defaults to 20480
def get_explode_infos(self) -> bool:
1513    def get_explode_infos(self) -> bool:
1514        """
1515        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
1516        to False if it is not set.
1517        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
1518        value. If the parameter is not present, it will return False.
1519        """
1520
1521        return self.get_param().get("explode", {}).get("explode_infos", False)

The function get_explode_infos returns the value of the "explode_infos" parameter, defaulting to False if it is not set.

Returns

The method is returning the value of the "explode_infos" parameter, which is a boolean value. If the parameter is not present, it will return False.

def get_explode_infos_fields( self, explode_infos_fields: str = None, remove_fields_not_in_header: bool = False) -> list:
1523    def get_explode_infos_fields(
1524        self,
1525        explode_infos_fields: str = None,
1526        remove_fields_not_in_header: bool = False,
1527    ) -> list:
1528        """
1529        The `get_explode_infos_fields` function returns a list of exploded information fields based on
1530        the input parameter `explode_infos_fields`.
1531
1532        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
1533        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
1534        comma-separated list of field names to explode
1535        :type explode_infos_fields: str
1536        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
1537        flag that determines whether to remove fields that are not present in the header. If it is set
1538        to `True`, any field that is not in the header will be excluded from the list of exploded
1539        information fields. If it is set to `, defaults to False
1540        :type remove_fields_not_in_header: bool (optional)
1541        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
1542        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
1543        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
1544        Otherwise, it returns a list of exploded information fields after removing any spaces and
1545        splitting the string by commas.
1546        """
1547
1548        # If no fields, get it in param
1549        if not explode_infos_fields:
1550            explode_infos_fields = (
1551                self.get_param().get("explode", {}).get("explode_infos_fields", None)
1552            )
1553
1554        # If no fields, defined as all fields in header using keyword
1555        if not explode_infos_fields:
1556            explode_infos_fields = "*"
1557
1558        # If fields list not empty
1559        if explode_infos_fields:
1560
1561            # Input fields list
1562            if isinstance(explode_infos_fields, str):
1563                fields_input = explode_infos_fields.split(",")
1564            elif isinstance(explode_infos_fields, list):
1565                fields_input = explode_infos_fields
1566            else:
1567                fields_input = []
1568
1569            # Fields list without * keyword
1570            fields_without_all = fields_input.copy()
1571            if "*".casefold() in (item.casefold() for item in fields_without_all):
1572                fields_without_all.remove("*")
1573
1574            # Fields in header
1575            fields_in_header = sorted(list(set(self.get_header().infos)))
1576
1577            # Construct list of fields
1578            fields_output = []
1579            for field in fields_input:
1580
1581                # Strip field
1582                field = field.strip()
1583
1584                # format keyword * in regex
1585                if field.upper() in ["*"]:
1586                    field = ".*"
1587
1588                # Find all fields with pattern
1589                r = re.compile(rf"^{field}$")
1590                fields_search = sorted(list(filter(r.match, fields_in_header)))
1591
1592                # Remove fields input from search
1593                if field in fields_search:
1594                    fields_search = [field]
1595                elif fields_search != [field]:
1596                    fields_search = sorted(
1597                        list(set(fields_search).difference(fields_input))
1598                    )
1599
1600                # If field is not in header (avoid not well formatted header)
1601                if not fields_search and not remove_fields_not_in_header:
1602                    fields_search = [field]
1603
1604                # Add found fields
1605                for new_field in fields_search:
1606                    # Add field, if not already exists, and if it is in header (if asked)
1607                    if (
1608                        new_field not in fields_output
1609                        and (
1610                            not remove_fields_not_in_header
1611                            or new_field in fields_in_header
1612                        )
1613                        and new_field not in [".*"]
1614                    ):
1615                        fields_output.append(new_field)
1616
1617            return fields_output
1618
1619        else:
1620
1621            return []

The get_explode_infos_fields function returns a list of exploded information fields based on the input parameter explode_infos_fields.

Parameters
  • explode_infos_fields: The explode_infos_fields parameter specifies the fields to be exploded, as a comma-separated string or a list. Each entry may be a field name or a regex pattern; the keyword "*" selects all fields present in the header
  • remove_fields_not_in_header: The parameter remove_fields_not_in_header is a boolean flag that determines whether to remove fields that are not present in the header. If it is set to True, any field that is not in the header will be excluded from the list of exploded information fields; if set to False, unknown fields are kept as-is. Defaults to False
Returns

The function get_explode_infos_fields returns a list of exploded information fields. If the explode_infos_fields parameter is not provided, the value is read from the parameters and falls back to "*", which resolves to all fields declared in the header. Otherwise, the provided fields are stripped of spaces, split by commas, matched against the header, and returned without duplicates.

def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1623    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1624        """
1625        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
1626        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
1627        not provided.
1628
1629        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
1630        prefix to be used for exploding or expanding information
1631        :type explode_infos_prefix: str
1632        :return: the value of the variable `explode_infos_prefix`.
1633        """
1634
1635        if not explode_infos_prefix:
1636            explode_infos_prefix = (
1637                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
1638            )
1639
1640        return explode_infos_prefix

The function get_explode_infos_prefix returns the value of the explode_infos_prefix parameter, or the value from the "explode" section of the parameters ("explode_infos_prefix" key, defaulting to "") if explode_infos_prefix is not provided.

Parameters
  • explode_infos_prefix: The parameter explode_infos_prefix is a string that specifies a prefix to be used for exploding or expanding information
Returns

the value of the variable explode_infos_prefix.

def add_column( self, table_name, column_name, column_type, default_value=None, drop: bool = False) -> dict:
1642    def add_column(
1643        self,
1644        table_name,
1645        column_name,
1646        column_type,
1647        default_value=None,
1648        drop: bool = False,
1649    ) -> dict:
1650        """
1651        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
1652        doesn't already exist.
1653
1654        :param table_name: The name of the table to which you want to add a column
1655        :param column_name: The parameter "column_name" is the name of the column that you want to add
1656        to the table
1657        :param column_type: The `column_type` parameter specifies the data type of the column that you
1658        want to add to the table. It should be a string that represents the desired data type, such as
1659        "INTEGER", "TEXT", "REAL", etc
1660        :param default_value: The `default_value` parameter is an optional parameter that specifies the
1661        default value for the newly added column. If a default value is provided, it will be assigned to
1662        the column for any existing rows that do not have a value for that column
1663        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
1664        if it already exists in the table. If `drop` is set to `True`, the function will drop the
1665        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
1666        to False
1667        :type drop: bool (optional)
1668        :return: a boolean value indicating whether the column was successfully added to the table.
1669        """
1670
1671        # added
1672        added = False
1673        dropped = False
1674
1675        # Check if the column already exists in the table
1676        query = f""" SELECT * FROM {table_name} LIMIT 0 """
1677        columns = self.get_query_to_df(query).columns.tolist()
1678        if column_name.upper() in [c.upper() for c in columns]:
1679            log.debug(
1680                f"The {column_name} column already exists in the {table_name} table"
1681            )
1682            if drop:
1683                self.drop_column(table_name=table_name, column_name=column_name)
1684                dropped = True
1685            else:
1686                return None
1687        else:
1688            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
1689
1690        # Add column in table
1691        add_column_query = (
1692            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
1693        )
1694        if default_value is not None:
1695            add_column_query += f" DEFAULT {default_value}"
1696        self.execute_query(add_column_query)
1697        added = not dropped
1698        log.debug(
1699            f"The {column_name} column was successfully added to the {table_name} table"
1700        )
1701
1702        if added:
1703            added_column = {
1704                "table_name": table_name,
1705                "column_name": column_name,
1706                "column_type": column_type,
1707                "default_value": default_value,
1708            }
1709        else:
1710            added_column = None
1711
1712        return added_column

The add_column function adds a column to a SQLite or DuckDB table with a default value if it doesn't already exist.

Parameters
  • table_name: The name of the table to which you want to add a column
  • column_name: The parameter "column_name" is the name of the column that you want to add to the table
  • column_type: The column_type parameter specifies the data type of the column that you want to add to the table. It should be a string that represents the desired data type, such as "INTEGER", "TEXT", "REAL", etc
  • default_value: The default_value parameter is an optional parameter that specifies the default value for the newly added column. If a default value is provided, it will be assigned to the column for any existing rows that do not have a value for that column
  • drop: The drop parameter is a boolean flag that determines whether to drop the column if it already exists in the table. If drop is set to True, the function will drop the existing column before adding the new column; if drop is set to False (default), an existing column is left untouched. Defaults to False
Returns

a dictionary describing the added column (table name, column name, type and default value), or None when the column already existed (whether it was kept or dropped and re-created).

def drop_column( self, column: dict = None, table_name: str = None, column_name: str = None) -> bool:
1714    def drop_column(
1715        self, column: dict = None, table_name: str = None, column_name: str = None
1716    ) -> bool:
1717        """
1718        The `drop_column` function drops a specified column from a given table in a database and returns
1719        True if the column was successfully dropped, and False if the column does not exist in the
1720        table.
1721
1722        :param column: The `column` parameter is a dictionary that contains information about the column
1723        you want to drop. It has two keys:
1724        :type column: dict
1725        :param table_name: The `table_name` parameter is the name of the table from which you want to
1726        drop a column
1727        :type table_name: str
1728        :param column_name: The `column_name` parameter is the name of the column that you want to drop
1729        from the table
1730        :type column_name: str
1731        :return: a boolean value. It returns True if the column was successfully dropped from the table,
1732        and False if the column does not exist in the table.
1733        """
1734
1735        # Find column infos
1736        if column:
1737            if isinstance(column, dict):
1738                table_name = column.get("table_name", None)
1739                column_name = column.get("column_name", None)
1740            elif isinstance(column, str):
1741                table_name = self.get_table_variants()
1742                column_name = column
1743            else:
1744                table_name = None
1745                column_name = None
1746
1747        if not table_name and not column_name:
1748            return False
1749
1750        # Removed
1751        removed = False
1752
1753        # Check if the column already exists in the table
1754        query = f""" SELECT * FROM {table_name} LIMIT 0 """
1755        columns = self.get_query_to_df(query).columns.tolist()
1756        if column_name in columns:
1757            log.debug(f"The {column_name} column exists in the {table_name} table")
1758        else:
1759            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
1760            return False
1761
1762        # Add column in table # ALTER TABLE integers DROP k
1763        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
1764        self.execute_query(add_column_query)
1765        removed = True
1766        log.debug(
1767            f"The {column_name} column was successfully dropped to the {table_name} table"
1768        )
1769
1770        return removed

The drop_column function drops a specified column from a given table in a database and returns True if the column was successfully dropped, and False if the column does not exist in the table.

Parameters
  • column: The column parameter is a dictionary that contains information about the column you want to drop. It has two keys: table_name and column_name. It may also be given as a bare column name string, in which case the variants table is used
  • table_name: The table_name parameter is the name of the table from which you want to drop a column
  • column_name: The column_name parameter is the name of the column that you want to drop from the table
Returns

a boolean value. It returns True if the column was successfully dropped from the table, and False if the column does not exist in the table.

def explode_infos( self, prefix: str = None, create_index: bool = False, fields: list = None, force: bool = False, proccess_all_fields_together: bool = False, table: str = None) -> list:
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into
        individual columns, returning a list of added columns.

        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
        `self.get_explode_infos_prefix()` as the prefix, falling back to "INFO/"
        :type prefix: str
        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
        `False`, indexes will not be created. The default value is `False`, defaults to False
        :type create_index: bool (optional)
        :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields
        that you want to explode into individual columns. If this parameter is not provided, all INFO
        fields will be exploded (see `get_explode_infos_fields`)
        :type fields: list
        :param force: The `force` parameter in the `explode_infos` function is a boolean flag that
        determines whether to drop and recreate a column if it already exists in the table. If `force`
        is set to `True`, the column will be dropped and recreated; if `False`, an existing column is
        left untouched, defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
        flag that determines whether to process all the INFO fields together or individually. If set to
        `True`, all the INFO fields will be updated in a single UPDATE statement; if `False`, one
        UPDATE per field is executed, defaults to False
        :type proccess_all_fields_together: bool (optional)
        :param table: The `table` parameter in the `explode_infos` function is used to specify the name
        of the table where the exploded INFO fields will be added as individual columns. If not
        provided, the variants table is used
        :type table: str
        :return: The `explode_infos` function returns a list of added columns.
        """

        # Indexes must be dropped before ALTER TABLE / mass UPDATE operations
        self.drop_indexes()

        # connexion format ("duckdb" or "sqlite")
        connexion_format = self.get_connexion_format()

        # Access mode: no schema/data changes on a read-only connexion
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        if access not in ["RO"]:

            # prefix (falls back to the configured prefix, then to "INFO/")
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos (best-effort: absence of extra infos is not an error)
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Info fields to check: header fields plus explicitly requested ones
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns (resolves "*" and regexes)
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                # Name of the SQL column backing this INFO field
                info_id_sql = prefix + info

                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Fields missing from the header fall back to String type
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Multi-valued fields (num != 1) are stored as VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field (returns None when the column already existed)
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    # With force=True the column may have been dropped/re-created,
                    # in which case add_column returns None but the value must
                    # still be (re)computed
                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Build the SET clause extracting "info=value" from INFO;
                        # empty values ('' or '.') are normalized to NULL.
                        # NOTE(review): if connexion_format is neither "duckdb"
                        # nor "sqlite", update_info_field is unbound (or stale
                        # from a previous iteration) — confirm supported formats
                        # are validated upstream
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                                "{info_id_sql}" =
                                    CASE
                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                    END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # Process chromosome by chromosome to keep UPDATEs smaller;
                # fall back to a single pass when the query fails
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (only needed when splitting by chromosome)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        # One UPDATE statement covering all fields at once
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter_table_array_join}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        # One UPDATE statement per field
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns

The explode_infos function in Python takes a VCF file and explodes the INFO fields into individual columns, returning a list of added columns.

Parameters
  • prefix: The prefix parameter is a string that is used as a prefix for the exploded INFO fields. If the prefix is not provided or is set to None, the function will use the value of self.get_explode_infos_prefix() as the prefix
  • create_index: The create_index parameter is a boolean flag that specifies whether to create indexes on the exploded INFO fields. If set to True, indexes will be created; if set to False, indexes will not be created. The default value is False, defaults to False
  • fields: The fields parameter in the explode_infos function is a list of INFO fields that you want to explode into individual columns. If this parameter is not provided, all INFO fields will be exploded. You can specify the INFO fields you want to explode by passing them as a list to the `
  • force: The force parameter in the explode_infos function is a boolean flag that determines whether to drop and recreate a column if it already exists in the table. If force is set to True, the column will be dropped and recreated; if force is set to False (default), an existing column is left untouched. Defaults to False
  • proccess_all_fields_together: The proccess_all_fields_together parameter is a boolean flag that determines whether to process all the INFO fields together or individually. If set to True, all the INFO fields will be processed together. If set to False, each INFO field will be processed individually. The default value is, defaults to False
  • table: The table parameter in the explode_infos function is used to specify the name of the table where the exploded INFO fields will be added as individual columns. If you provide a value for the table parameter, the function will use that table name. If the table parameter is
Returns

The explode_infos function returns a list of added columns.

def create_indexes(self) -> None:
1989    def create_indexes(self) -> None:
1990        """
1991        Create indexes on the table after insertion
1992        """
1993
1994        # Access
1995        access = self.get_config().get("access", None)
1996
1997        # get table variants
1998        table_variants = self.get_table_variants("FROM")
1999
2000        if self.get_indexing() and access not in ["RO"]:
2001            # Create index
2002            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
2003            self.conn.execute(sql_create_table_index)
2004            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
2005            self.conn.execute(sql_create_table_index)
2006            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
2007            self.conn.execute(sql_create_table_index)
2008            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
2009            self.conn.execute(sql_create_table_index)
2010            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
2011            self.conn.execute(sql_create_table_index)
2012            for field in self.index_additionnal_fields:
2013                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
2014                self.conn.execute(sql_create_table_index)

Create indexes on the table after insertion

def drop_indexes(self) -> None:
2016    def drop_indexes(self) -> None:
2017        """
2018        Create indexes on the table after insertion
2019        """
2020
2021        # Access
2022        access = self.get_config().get("access", None)
2023
2024        # get table variants
2025        table_variants = self.get_table_variants("FROM")
2026
2027        # Get database format
2028        connexion_format = self.get_connexion_format()
2029
2030        if access not in ["RO"]:
2031            if connexion_format in ["duckdb"]:
2032                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
2033            elif connexion_format in ["sqlite"]:
2034                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
2035
2036            list_indexes = self.conn.execute(sql_list_indexes)
2037            index_names = [row[0] for row in list_indexes.fetchall()]
2038            for index in index_names:
2039                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
2040                self.conn.execute(sql_drop_table_index)

Drop all existing indexes of the variants table (no-op on read-only connexions)

def read_vcf_header(self, f) -> list:
2042    def read_vcf_header(self, f) -> list:
2043        """
2044        It reads the header of a VCF file and returns a list of the header lines
2045
2046        :param f: the file object
2047        :return: The header lines of the VCF file.
2048        """
2049
2050        header_list = []
2051        for line in f:
2052            header_list.append(line)
2053            if line.startswith("#CHROM"):
2054                break
2055        return header_list

It reads the header of a VCF file and returns a list of the header lines

Parameters
  • f: the file object
Returns

The header lines of the VCF file.

def read_vcf_header_file(self, file: str = None) -> list:
2057    def read_vcf_header_file(self, file: str = None) -> list:
2058        """
2059        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
2060        uncompressed files.
2061
2062        :param file: The `file` parameter is a string that represents the path to the VCF header file
2063        that you want to read. It is an optional parameter, so if you don't provide a value, it will
2064        default to `None`
2065        :type file: str
2066        :return: The function `read_vcf_header_file` returns a list.
2067        """
2068
2069        if self.get_input_compressed(input_file=file):
2070            with bgzf.open(file, "rt") as f:
2071                return self.read_vcf_header(f=f)
2072        else:
2073            with open(file, "rt") as f:
2074                return self.read_vcf_header(f=f)

The read_vcf_header_file function reads the header of a VCF file, handling both compressed and uncompressed files.

Parameters
  • file: The file parameter is a string that represents the path to the VCF header file that you want to read. It is an optional parameter, so if you don't provide a value, it will default to None
Returns

The function read_vcf_header_file returns a list.

def execute_query(self, query: str):
2076    def execute_query(self, query: str):
2077        """
2078        It takes a query as an argument, executes it, and returns the results
2079
2080        :param query: The query to be executed
2081        :return: The result of the query is being returned.
2082        """
2083        if query:
2084            return self.conn.execute(query)  # .fetchall()
2085        else:
2086            return None

It takes a query as an argument, executes it, and returns the results

Parameters
  • query: The query to be executed
Returns

The result of the query is being returned.

def export_output( self, output_file: str | None = None, output_header: str | None = None, export_header: bool = True, query: str | None = None, parquet_partitions: list | None = None, chunk_size: int | None = None, threads: int | None = None, sort: bool = False, index: bool = False, order_by: str | None = None, fields_to_rename: dict | None = None) -> bool:
2088    def export_output(
2089        self,
2090        output_file: str | None = None,
2091        output_header: str | None = None,
2092        export_header: bool = True,
2093        query: str | None = None,
2094        parquet_partitions: list | None = None,
2095        chunk_size: int | None = None,
2096        threads: int | None = None,
2097        sort: bool = False,
2098        index: bool = False,
2099        order_by: str | None = None,
2100        fields_to_rename: dict | None = None,
2101    ) -> bool:
2102        """
2103        The `export_output` function exports data from a VCF file to various formats, including VCF,
2104        CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and
2105        partitioning.
2106
2107        :param output_file: The `output_file` parameter is a string that specifies the name of the
2108        output file where the exported data will be saved
2109        :type output_file: str | None
2110        :param output_header: The `output_header` parameter is a string that specifies the name of the
2111        file where the header of the VCF file will be exported. If this parameter is not provided, the
2112        header will be exported to a file with the same name as the `output_file` parameter, but with
2113        the extension "
2114        :type output_header: str | None
2115        :param export_header: The `export_header` parameter is a boolean flag that determines whether
2116        the header of a VCF file should be exported to a separate file or not. If `export_header` is
2117        True, the header will be exported to a file. If `export_header` is False, the header will not
2118        be, defaults to True
2119        :type export_header: bool (optional)
2120        :param query: The `query` parameter in the `export_output` function is an optional SQL query
2121        that can be used to filter and select specific data from the VCF file before exporting it. If
2122        provided, only the data that matches the query will be exported. This allows you to customize
2123        the exported data based on
2124        :type query: str | None
2125        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
2126        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
2127        organize data in a hierarchical directory structure based on the values of one or more columns.
2128        This can improve query performance when working with large datasets
2129        :type parquet_partitions: list | None
2130        :param chunk_size: The `chunk_size` parameter specifies the number of records in a batch when
2131        exporting data in Parquet format. This parameter is used for partitioning the Parquet file into
2132        multiple files. It helps in optimizing the export process by breaking down the data into
2133        manageable chunks for processing and storage
2134        :type chunk_size: int | None
2135        :param threads: The `threads` parameter in the `export_output` function specifies the number of
2136        threads to be used during the export process. It determines the level of parallelism and can
2137        improve the performance of the export operation. If this parameter is not provided, the function
2138        will use the default number of threads
2139        :type threads: int | None
2140        :param sort: The `sort` parameter in the `export_output` function is a boolean flag that
2141        determines whether the output file should be sorted based on genomic coordinates of the
2142        variants. If `sort` is set to `True`, the output file will be sorted. If `sort` is set to
2143        `False`,, defaults to False
2144        :type sort: bool (optional)
2145        :param index: The `index` parameter in the `export_output` function is a boolean flag that
2146        determines whether an index should be created on the output file. If `index` is set to `True`,
2147        an index will be created on the output file. If `index` is set to `False`, no, defaults to False
2148        :type index: bool (optional)
2149        :param order_by: The `order_by` parameter in the `export_output` function is a string that
2150        specifies the column(s) to use for sorting the output file. This parameter is only applicable
2151        when exporting data in VCF format. It allows you to specify the column(s) based on which the
2152        output file should be
2153        :type order_by: str | None
2154        :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that specifies the
2155        mapping of field names to be renamed during the export process. This parameter allows you to
2156        customize the output field names before exporting the data. Each key-value pair in the
2157        dictionary represents the original field name as the key and the new field name
2158        :type fields_to_rename: dict | None
2159        :return: The `export_output` function returns a boolean value. It checks if the output file
2160        exists and returns True if it does, or None if it doesn't.
2161        """
2162
2163        # Log
2164        log.info("Exporting...")
2165
2166        # Full path
2167        output_file = full_path(output_file)
2168        output_header = full_path(output_header)
2169
2170        # Config
2171        config = self.get_config()
2172
2173        # Param
2174        param = self.get_param()
2175
2176        # Tmp files to remove
2177        tmp_to_remove = []
2178
2179        # If no output, get it
2180        if not output_file:
2181            output_file = self.get_output()
2182
2183        # If not threads
2184        if not threads:
2185            threads = self.get_threads()
2186
2187        # Rename fields
2188        if not fields_to_rename:
2189            fields_to_rename = param.get("export", {}).get("fields_to_rename", None)
2190        self.rename_info_fields(fields_to_rename=fields_to_rename)
2191
2192        # Auto header name with extension
2193        if export_header or output_header:
2194            if not output_header:
2195                output_header = f"{output_file}.hdr"
2196            # Export header
2197            self.export_header(output_file=output_file)
2198
2199        # Switch off export header if VCF output
2200        output_file_type = get_file_format(output_file)
2201        if output_file_type in ["vcf"]:
2202            export_header = False
2203            tmp_to_remove.append(output_header)
2204
2205        # Chunk size
2206        if not chunk_size:
2207            chunk_size = config.get("chunk_size", None)
2208
2209        # Parquet partition
2210        if not parquet_partitions:
2211            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
2212        if parquet_partitions and isinstance(parquet_partitions, str):
2213            parquet_partitions = parquet_partitions.split(",")
2214
2215        # Order by
2216        if not order_by:
2217            order_by = param.get("export", {}).get("order_by", "")
2218
2219        # Header in output
2220        header_in_output = param.get("export", {}).get("include_header", False)
2221
2222        # Database
2223        database_source = self.get_connexion()
2224
2225        # Connexion format
2226        connexion_format = self.get_connexion_format()
2227
2228        # Explode infos
2229        if self.get_explode_infos():
2230            self.explode_infos(
2231                prefix=self.get_explode_infos_prefix(),
2232                fields=self.get_explode_infos_fields(),
2233                force=False,
2234            )
2235
2236        # if connexion_format in ["sqlite"] or query:
2237        if connexion_format in ["sqlite"]:
2238
2239            # Export in Parquet
2240            random_tmp = "".join(
2241                random.choice(string.ascii_lowercase) for i in range(10)
2242            )
2243            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
2244            tmp_to_remove.append(database_source)
2245
2246            # Table Variants
2247            table_variants = self.get_table_variants()
2248
2249            # Create export query
2250            sql_query_export_subquery = f"""
2251                SELECT * FROM {table_variants}
2252                """
2253
2254            # Write source file
2255            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
2256
2257        # Create database
2258        database = Database(
2259            database=database_source,
2260            table="variants",
2261            header_file=output_header,
2262            conn_config=self.get_connexion_config(),
2263        )
2264
2265        # Existing colomns header
2266        existing_columns_header = database.get_header_columns_from_database(query=query)
2267
2268        # Sample list
2269        if output_file_type in ["vcf"]:
2270            get_samples = self.get_samples()
2271            get_samples_check = self.get_samples_check()
2272            samples_force = get_samples is not None
2273            sample_list = self.get_header_sample_list(
2274                check=get_samples_check,
2275                samples=get_samples,
2276                samples_force=samples_force,
2277            )
2278        else:
2279            sample_list = None
2280
2281        # Export file
2282        database.export(
2283            output_database=output_file,
2284            output_header=output_header,
2285            existing_columns_header=existing_columns_header,
2286            parquet_partitions=parquet_partitions,
2287            chunk_size=chunk_size,
2288            threads=threads,
2289            sort=sort,
2290            index=index,
2291            header_in_output=header_in_output,
2292            order_by=order_by,
2293            query=query,
2294            export_header=export_header,
2295            sample_list=sample_list,
2296        )
2297
2298        # Remove
2299        remove_if_exists(tmp_to_remove)
2300
2301        return (os.path.exists(output_file) or None) and (
2302            os.path.exists(output_file) or None
2303        )

The export_output function exports data from a VCF file to various formats, including VCF, CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and partitioning.

Parameters
  • output_file: The output_file parameter is a string that specifies the name of the output file where the exported data will be saved
  • output_header: The output_header parameter is a string that specifies the name of the file where the header of the VCF file will be exported. If this parameter is not provided, the header will be exported to a file with the same name as the output_file parameter, but with the extension ".hdr"
  • export_header: The export_header parameter is a boolean flag that determines whether the header of a VCF file should be exported to a separate file or not. If export_header is True, the header will be exported to a file. If export_header is False, the header will not be, defaults to True
  • query: The query parameter in the export_output function is an optional SQL query that can be used to filter and select specific data from the VCF file before exporting it. If provided, only the data that matches the query will be exported. This allows you to customize the exported data based on
  • parquet_partitions: The parquet_partitions parameter is a list that specifies the columns to be used for partitioning the Parquet file during export. Partitioning is a way to organize data in a hierarchical directory structure based on the values of one or more columns. This can improve query performance when working with large datasets
  • chunk_size: The chunk_size parameter specifies the number of records in a batch when exporting data in Parquet format. This parameter is used for partitioning the Parquet file into multiple files. It helps in optimizing the export process by breaking down the data into manageable chunks for processing and storage
  • threads: The threads parameter in the export_output function specifies the number of threads to be used during the export process. It determines the level of parallelism and can improve the performance of the export operation. If this parameter is not provided, the function will use the default number of threads
  • sort: The sort parameter in the export_output function is a boolean flag that determines whether the output file should be sorted based on genomic coordinates of the variants. If sort is set to True, the output file will be sorted; if set to False, it is left unsorted. Defaults to False
  • index: The index parameter in the export_output function is a boolean flag that determines whether an index should be created on the output file. If index is set to True, an index will be created on the output file. If index is set to False, no, defaults to False
  • order_by: The order_by parameter in the export_output function is a string that specifies the column(s) to use for sorting the output file. This parameter is only applicable when exporting data in VCF format. It allows you to specify the column(s) based on which the output file should be
  • fields_to_rename: The fields_to_rename parameter is a dictionary that specifies the mapping of field names to be renamed during the export process. This parameter allows you to customize the output field names before exporting the data. Each key-value pair in the dictionary represents the original field name as the key and the new field name
Returns

The export_output function returns a boolean value. It checks if the output file exists and returns True if it does, or None if it doesn't.

def get_extra_infos(self, table: str = None) -> list:
2305    def get_extra_infos(self, table: str = None) -> list:
2306        """
2307        The `get_extra_infos` function returns a list of columns that are in a specified table but not
2308        in the header.
2309
2310        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
2311        name of the table from which you want to retrieve the extra columns that are not present in the
2312        header. If the `table` parameter is not provided when calling the function, it will default to
2313        using the variants
2314        :type table: str
2315        :return: A list of columns that are in the specified table but not in the header of the table.
2316        """
2317
2318        header_columns = []
2319
2320        if not table:
2321            table = self.get_table_variants(clause="from")
2322            header_columns = self.get_header_columns()
2323
2324        # Check all columns in the database
2325        query = f""" SELECT * FROM {table} LIMIT 1 """
2326        log.debug(f"query {query}")
2327        table_columns = self.get_query_to_df(query).columns.tolist()
2328        extra_columns = []
2329
2330        # Construct extra infos (not in header)
2331        for column in table_columns:
2332            if column not in header_columns:
2333                extra_columns.append(column)
2334
2335        return extra_columns

The get_extra_infos function returns a list of columns that are in a specified table but not in the header.

Parameters
  • table: The table parameter in the get_extra_infos function is used to specify the name of the table from which you want to retrieve the extra columns that are not present in the header. If the table parameter is not provided when calling the function, it will default to using the variants
Returns

A list of columns that are in the specified table but not in the header of the table.

def get_extra_infos_sql(self, table: str = None) -> str:
2337    def get_extra_infos_sql(self, table: str = None) -> str:
2338        """
2339        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
2340        by double quotes
2341
2342        :param table: The name of the table to get the extra infos from. If None, the default table is
2343        used
2344        :type table: str
2345        :return: A string of the extra infos
2346        """
2347
2348        return ", ".join(
2349            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
2350        )

It returns a string of the extra infos, separated by commas, and each extra info is surrounded by double quotes

Parameters
  • table: The name of the table to get the extra infos from. If None, the default table is used
Returns

A string of the extra infos

def export_header( self, header_name: str = None, output_file: str = None, output_file_ext: str = '.hdr', clean_header: bool = True, remove_chrom_line: bool = False) -> str:
2352    def export_header(
2353        self,
2354        header_name: str = None,
2355        output_file: str = None,
2356        output_file_ext: str = ".hdr",
2357        clean_header: bool = True,
2358        remove_chrom_line: bool = False,
2359    ) -> str:
2360        """
2361        The `export_header` function takes a VCF file, extracts the header, modifies it according to
2362        specified options, and writes it to a new file.
2363
2364        :param header_name: The `header_name` parameter is the name of the header file to be created. If
2365        this parameter is not specified, the header will be written to the output file
2366        :type header_name: str
2367        :param output_file: The `output_file` parameter in the `export_header` function is used to
2368        specify the name of the output file where the header will be written. If this parameter is not
2369        provided, the header will be written to a temporary file
2370        :type output_file: str
2371        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
2372        string that represents the extension of the output header file. By default, it is set to ".hdr"
2373        if not specified by the user. This extension will be appended to the `output_file` name to
2374        create the final, defaults to .hdr
2375        :type output_file_ext: str (optional)
2376        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
2377        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
2378        `True`, the function will clean the header by modifying certain lines based on a specific
2379        pattern. If `clean_header`, defaults to True
2380        :type clean_header: bool (optional)
2381        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
2382        boolean flag that determines whether the #CHROM line should be removed from the header before
2383        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
2384        defaults to False
2385        :type remove_chrom_line: bool (optional)
2386        :return: The function `export_header` returns the name of the temporary header file that is
2387        created.
2388        """
2389
2390        if not header_name and not output_file:
2391            output_file = self.get_output()
2392
2393        if self.get_header():
2394
2395            # Get header object
2396            header_obj = self.get_header()
2397
2398            # Create database
2399            db_for_header = Database(database=self.get_input())
2400
2401            # Get real columns in the file
2402            db_header_columns = db_for_header.get_columns()
2403
2404            with tempfile.TemporaryDirectory() as tmpdir:
2405
2406                # Write header file
2407                header_file_tmp = os.path.join(tmpdir, "header")
2408                f = open(header_file_tmp, "w")
2409                vcf.Writer(f, header_obj)
2410                f.close()
2411
2412                # Replace #CHROM line with rel columns
2413                header_list = db_for_header.read_header_file(
2414                    header_file=header_file_tmp
2415                )
2416                header_list[-1] = "\t".join(db_header_columns)
2417
2418                # Remove CHROM line
2419                if remove_chrom_line:
2420                    header_list.pop()
2421
2422                # Clean header
2423                if clean_header:
2424                    header_list_clean = []
2425                    for head in header_list:
2426                        # Clean head for malformed header
2427                        head_clean = head
2428                        head_clean = re.subn(
2429                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
2430                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
2431                            head_clean,
2432                            2,
2433                        )[0]
2434                        # Write header
2435                        header_list_clean.append(head_clean)
2436                    header_list = header_list_clean
2437
2438            tmp_header_name = output_file + output_file_ext
2439
2440            f = open(tmp_header_name, "w")
2441            for line in header_list:
2442                f.write(line)
2443            f.close()
2444
2445        return tmp_header_name

The export_header function takes a VCF file, extracts the header, modifies it according to specified options, and writes it to a new file.

Parameters
  • header_name: The header_name parameter is the name of the header file to be created. If this parameter is not specified, the header will be written to the output file
  • output_file: The output_file parameter in the export_header function is used to specify the name of the output file where the header will be written. If this parameter is not provided, the header will be written to a temporary file
  • output_file_ext: The output_file_ext parameter in the export_header function is a string that represents the extension of the output header file. By default, it is set to ".hdr" if not specified by the user. This extension will be appended to the output_file name to create the final, defaults to .hdr
  • clean_header: The clean_header parameter in the export_header function is a boolean flag that determines whether the header should be cleaned or not. When clean_header is set to True, the function will clean the header by modifying certain lines based on a specific pattern. If clean_header, defaults to True
  • remove_chrom_line: The remove_chrom_line parameter in the export_header function is a boolean flag that determines whether the #CHROM line should be removed from the header before writing it to the output file. If set to True, the #CHROM line will be removed; if set to False, it is kept. Defaults to False
Returns

The function export_header returns the name of the temporary header file that is created.

def export_variant_vcf( self, vcf_file, remove_info: bool = False, add_samples: bool = True, list_samples: list = [], where_clause: str = '', index: bool = False, threads: int | None = None) -> bool | None:
2447    def export_variant_vcf(
2448        self,
2449        vcf_file,
2450        remove_info: bool = False,
2451        add_samples: bool = True,
2452        list_samples: list = [],
2453        where_clause: str = "",
2454        index: bool = False,
2455        threads: int | None = None,
2456    ) -> bool | None:
2457        """
2458        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
2459        remove INFO field, add samples, and control compression and indexing.
2460
2461        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
2462        written to. It is the output file that will contain the filtered VCF data based on the specified
2463        parameters
2464        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
2465        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
2466        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
2467        in, defaults to False
2468        :type remove_info: bool (optional)
2469        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
2470        the samples should be added to the VCF file or not. If set to True, the samples will be added.
2471        If set to False, the samples will be removed. The default value is True, defaults to True
2472        :type add_samples: bool (optional)
2473        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
2474        in the output VCF file. By default, all samples will be included. If you provide a list of
2475        samples, only those samples will be included in the output file
2476        :type list_samples: list
2477        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
2478        determines whether or not to create an index for the output VCF file. If `index` is set to
2479        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
2480        :type index: bool (optional)
2481        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
2482        number of threads to use for exporting the VCF file. It determines how many parallel threads
2483        will be used during the export process. More threads can potentially speed up the export process
2484        by utilizing multiple cores of the processor. If
2485        :type threads: int | None
2486        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
2487        method with various parameters including the output file, query, threads, sort flag, and index
2488        flag. The `export_output` method is responsible for exporting the VCF data based on the
2489        specified parameters and configurations provided in the `export_variant_vcf` function.
2490        """
2491
2492        # Config
2493        config = self.get_config()
2494
2495        # Extract VCF
2496        log.debug("Export VCF...")
2497
2498        # Table variants
2499        table_variants = self.get_table_variants()
2500
2501        # Threads
2502        if not threads:
2503            threads = self.get_threads()
2504
2505        # Info fields
2506        if remove_info:
2507            if not isinstance(remove_info, str):
2508                remove_info = "."
2509            info_field = f"""'{remove_info}' as INFO"""
2510        else:
2511            info_field = "INFO"
2512
2513        # Samples fields
2514        if add_samples:
2515            if not list_samples:
2516                list_samples = self.get_header_sample_list()
2517            if list_samples:
2518                samples_fields = " , FORMAT , " + " , ".join(
2519                    [f""" "{sample}" """ for sample in list_samples]
2520                )
2521            else:
2522                samples_fields = ""
2523            log.debug(f"samples_fields: {samples_fields}")
2524        else:
2525            samples_fields = ""
2526
2527        # Where clause
2528        if where_clause is None:
2529            where_clause = ""
2530
2531        # Variants
2532        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
2533        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
2534        log.debug(f"sql_query_select={sql_query_select}")
2535
2536        return self.export_output(
2537            output_file=vcf_file,
2538            output_header=None,
2539            export_header=True,
2540            query=sql_query_select,
2541            parquet_partitions=None,
2542            chunk_size=config.get("chunk_size", None),
2543            threads=threads,
2544            sort=True,
2545            index=index,
2546            order_by=None,
2547        )

The export_variant_vcf function exports a VCF file with specified samples, allowing options to remove INFO field, add samples, and control compression and indexing.

Parameters
  • vcf_file: The vcf_file parameter is the name of the file where the VCF data will be written to. It is the output file that will contain the filtered VCF data based on the specified parameters
  • remove_info: The remove_info parameter in the export_variant_vcf function is a boolean flag that determines whether to remove the INFO field from the output VCF file. If set to True, the INFO field will be removed. If set to False, the INFO field will be included in, defaults to False
  • add_samples: The add_samples parameter is a boolean parameter that determines whether the samples should be added to the VCF file or not. If set to True, the samples will be added. If set to False, the samples will be removed. The default value is True, defaults to True
  • list_samples: The list_samples parameter is a list of samples that you want to include in the output VCF file. By default, all samples will be included. If you provide a list of samples, only those samples will be included in the output file
  • index: The index parameter in the export_variant_vcf function is a boolean flag that determines whether or not to create an index for the output VCF file. If index is set to True, the output VCF file will be indexed using tabix. If index, defaults to False
  • threads: The threads parameter in the export_variant_vcf function specifies the number of threads to use for exporting the VCF file. It determines how many parallel threads will be used during the export process, potentially speeding it up by utilizing multiple processor cores. If not provided, the instance's default thread count is used
Returns

The export_variant_vcf function returns the result of calling the export_output method with various parameters including the output file, query, threads, sort flag, and index flag. The export_output method is responsible for exporting the VCF data based on the specified parameters and configurations provided in the export_variant_vcf function.

def run_commands(self, commands: list = [], threads: int = 1) -> None:
2549    def run_commands(self, commands: list = [], threads: int = 1) -> None:
2550        """
2551        It takes a list of commands and runs them in parallel using the number of threads specified
2552
2553        :param commands: A list of commands to run
2554        :param threads: The number of threads to use, defaults to 1 (optional)
2555        """
2556
2557        run_parallel_commands(commands, threads)

It takes a list of commands and runs them in parallel using the number of threads specified

Parameters
  • commands: A list of commands to run
  • threads: The number of threads to use, defaults to 1 (optional)
def get_threads(self, default: int = 1) -> int:
2559    def get_threads(self, default: int = 1) -> int:
2560        """
2561        This function returns the number of threads to use for a job, with a default value of 1 if not
2562        specified.
2563
2564        :param default: The `default` parameter in the `get_threads` method is used to specify the
2565        default number of threads to use if no specific value is provided. If no value is provided for
2566        the `threads` parameter in the configuration or input parameters, the `default` value will be
2567        used, defaults to 1
2568        :type default: int (optional)
2569        :return: the number of threads to use for the current job.
2570        """
2571
2572        # Config
2573        config = self.get_config()
2574
2575        # Param
2576        param = self.get_param()
2577
2578        # Input threads
2579        input_thread = param.get("threads", config.get("threads", None))
2580
2581        # Check threads
2582        if not input_thread:
2583            threads = default
2584        elif int(input_thread) <= 0:
2585            threads = os.cpu_count()
2586        else:
2587            threads = int(input_thread)
2588        return threads

This function returns the number of threads to use for a job, with a default value of 1 if not specified.

Parameters
  • default: The default parameter in the get_threads method is used to specify the default number of threads to use if no specific value is provided. If no value is provided for the threads parameter in the configuration or input parameters, the default value will be used, defaults to 1
Returns

the number of threads to use for the current job.

def get_memory(self, default: str = None) -> str:
2590    def get_memory(self, default: str = None) -> str:
2591        """
2592        This function retrieves the memory value from parameters or configuration with a default value
2593        if not found.
2594
2595        :param default: The `get_memory` function takes in a default value as a string parameter. This
2596        default value is used as a fallback in case the `memory` parameter is not provided in the
2597        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
2598        the function
2599        :type default: str
2600        :return: The `get_memory` function returns a string value representing the memory parameter. If
2601        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
2602        return the default value provided as an argument to the function.
2603        """
2604
2605        # Config
2606        config = self.get_config()
2607
2608        # Param
2609        param = self.get_param()
2610
2611        # Input threads
2612        input_memory = param.get("memory", config.get("memory", None))
2613
2614        # Check threads
2615        if input_memory:
2616            memory = input_memory
2617        else:
2618            memory = default
2619
2620        return memory

This function retrieves the memory value from parameters or configuration with a default value if not found.

Parameters
  • default: The get_memory function takes in a default value as a string parameter. This default value is used as a fallback in case the memory parameter is not provided in the param dictionary or the config dictionary. If memory is not found in either dictionary, the function returns this default value.
Returns

The get_memory function returns a string value representing the memory parameter. If the input_memory is provided in the parameters, it will return that value. Otherwise, it will return the default value provided as an argument to the function.

def update_from_vcf(self, vcf_file: str) -> None:
2622    def update_from_vcf(self, vcf_file: str) -> None:
2623        """
2624        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
2625
2626        :param vcf_file: the path to the VCF file
2627        """
2628
2629        connexion_format = self.get_connexion_format()
2630
2631        if connexion_format in ["duckdb"]:
2632            self.update_from_vcf_duckdb(vcf_file)
2633        elif connexion_format in ["sqlite"]:
2634            self.update_from_vcf_sqlite(vcf_file)

If the database connexion is duckdb, then use the duckdb method; otherwise use the sqlite method

Parameters
  • vcf_file: the path to the VCF file
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Append the INFO column of a VCF file to the INFO column of the
        variants table (duckdb connexion).

        Rows are matched on #CHROM/POS/REF/ALT. When both the existing and
        the incoming INFO are non-empty (not '' or '.'), they are joined
        with ';'; empty placeholders contribute nothing.

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame, skipping the header lines;
        # the remaining first line ('#CHROM' ...) is used as column header.
        # Assumes get_header_length counts only the '##' meta lines -- TODO confirm
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # The query below references 'vcf_df' by name; presumably duckdb
        # resolves the local DataFrame directly (replacement scan) -- verify
        # against the connexion setup
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)

It takes a VCF file and updates the INFO column of the variants table in the database with the INFO column of the VCF file

Parameters
  • vcf_file: the path to the VCF file
    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        Append the INFO column of a VCF file to the INFO column of the
        variants table (sqlite connexion).

        A temporary table with the variants table's schema is created, the
        VCF body is bulk-loaded into it, the variants table is updated by
        matching on #CHROM/POS/REF/ALT, and the temporary table is dropped.

        :param vcf_file: The path to the VCF file you want to update the database with
        """

        # Create a temporary table with the same columns as 'variants';
        # 'WHERE 0' copies the schema without copying any rows
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Load the VCF into the temporary table.
        # NOTE(review): comment='#' makes pandas ignore each line from the
        # first '#' onward -- this skips the header lines, but would also
        # truncate data lines whose fields contain '#'; confirm inputs
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        # Assumes a sample-less, 8-column VCF -- TODO confirm
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data; existing and incoming INFO
        # are joined with ';' only when both are non-empty (not '' or '.')
        # warning: CONCAT as || operator
        sql_query_update = f"""
            UPDATE variants as table_variants
            SET INFO = CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END ||
                        (
                        SELECT 
                            CASE 
                                WHEN table_variants.INFO NOT IN ('','.') 
                                    AND table_vcf.INFO NOT IN ('','.')  
                                THEN ';' 
                                ELSE '' 
                            END || 
                            CASE 
                                WHEN table_vcf.INFO NOT IN ('','.') 
                                THEN table_vcf.INFO 
                                ELSE '' 
                            END
                        FROM {table_vcf} as table_vcf
                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                            AND table_vcf.\"POS\" = table_variants.\"POS\"
                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                            AND table_vcf.\"REF\" = table_variants.\"REF\"
                        )
        """
        self.conn.execute(sql_query_update)

        # Drop temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)

It creates a temporary table in the SQLite database, loads the VCF file into the temporary table, then updates the INFO column of the variants table with the INFO column of the temporary table

Parameters
  • vcf_file: The path to the VCF file you want to update the database with
def drop_variants_table(self) -> None:
2750    def drop_variants_table(self) -> None:
2751        """
2752        > This function drops the variants table
2753        """
2754
2755        table_variants = self.get_table_variants()
2756        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
2757        self.conn.execute(sql_table_variants)

This function drops the variants table

    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Add a variant-id column to the variants table and populate it with a
        hash built from the assembly, `#CHROM`, `POS`, `REF` and `ALT`
        columns.

        :param variant_id_column: The name of the column to be created in the variants table, defaults
        to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the variant_id column will be created even if it already exists
        :type force: bool
        :return: The name of the column that contains the variant_id
        """

        # Assembly: param overrides config, with a global default
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE into its own column (dropped again at the end)
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # variant_id column (fall back to the default name)
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column
        # NOTE(review): this tests the literal name "variant_id" rather than
        # variant_id_column -- confirm whether a custom column name should be
        # checked here instead
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column
            # NOTE(review): the last hash() argument is a single-quoted
            # literal ('"{prefix}SVTYPE"'), i.e. a constant string, not the
            # exploded SVTYPE column value -- confirm whether the column
            # reference was intended
            self.conn.execute(
                f"""
                    UPDATE {table_variants}
                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column

It adds a column to the variants table called variant_id and populates it with a hash of the #CHROM, POS, REF, and ALT columns

Parameters
  • variant_id_column: The name of the column to be created in the variants table, defaults to variant_id
  • force: If True, the variant_id column will be created even if it already exists
Returns

The name of the column that contains the variant_id

def get_variant_id_column(self, variant_id_column: str = 'variant_id', force: bool = None) -> str:
2818    def get_variant_id_column(
2819        self, variant_id_column: str = "variant_id", force: bool = None
2820    ) -> str:
2821        """
2822        This function returns the variant_id column name
2823
2824        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
2825        defaults to variant_id
2826        :type variant_id_column: str (optional)
2827        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
2828        False, will only set the variant_id if it is not already set. If None, will set the variant_id
2829        if it is not already set, or if it is set
2830        :type force: bool
2831        :return: The variant_id column name.
2832        """
2833
2834        return self.set_variant_id(variant_id_column=variant_id_column, force=force)

This function returns the variant_id column name

Parameters
  • variant_id_column: The name of the column in the dataframe that contains the variant IDs, defaults to variant_id
  • force: If True, will force the variant_id column to be (re)created. If False or None, the variant_id column will only be created if it does not already exist.
Returns

The variant_id column name.

def scan_databases( self, database_formats: list = ['parquet'], database_releases: list = ['current']) -> dict:
2840    def scan_databases(
2841        self,
2842        database_formats: list = ["parquet"],
2843        database_releases: list = ["current"],
2844    ) -> dict:
2845        """
2846        The function `scan_databases` scans for available databases based on specified formats and
2847        releases.
2848
2849        :param database_formats: The `database_formats` parameter is a list that specifies the formats
2850        of the databases to be scanned. In this case, the accepted format is "parquet"
2851        :type database_formats: list ["parquet"]
2852        :param database_releases: The `database_releases` parameter is a list that specifies the
2853        releases of the databases to be scanned. In the provided function, the default value for
2854        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
2855        databases that are in the "current"
2856        :type database_releases: list
2857        :return: The function `scan_databases` returns a dictionary containing information about
2858        databases that match the specified formats and releases.
2859        """
2860
2861        # Config
2862        config = self.get_config()
2863
2864        # Param
2865        param = self.get_param()
2866
2867        # Param - Assembly
2868        assembly = param.get("assembly", config.get("assembly", None))
2869        if not assembly:
2870            assembly = DEFAULT_ASSEMBLY
2871            log.warning(f"Default assembly '{assembly}'")
2872
2873        # Scan for availabled databases
2874        log.info(
2875            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
2876        )
2877        databases_infos_dict = databases_infos(
2878            database_folder_releases=database_releases,
2879            database_formats=database_formats,
2880            assembly=assembly,
2881            config=config,
2882        )
2883        log.info(
2884            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
2885        )
2886
2887        return databases_infos_dict

The function scan_databases scans for available databases based on specified formats and releases.

Parameters
  • database_formats: The database_formats parameter is a list that specifies the formats of the databases to be scanned. In this case, the accepted format is "parquet"
  • database_releases: The database_releases parameter is a list that specifies the releases of the databases to be scanned. In the provided function, the default value for database_releases is set to ["current"], meaning that by default, the function will scan databases from the "current" release.
Returns

The function scan_databases returns a dictionary containing information about databases that match the specified formats and releases.

def annotation(self) -> None:
2889    def annotation(self) -> None:
2890        """
2891        It annotates the VCF file with the annotations specified in the config file.
2892        """
2893
2894        # Config
2895        config = self.get_config()
2896
2897        # Param
2898        param = self.get_param()
2899
2900        # Param - Assembly
2901        assembly = param.get("assembly", config.get("assembly", None))
2902        if not assembly:
2903            assembly = DEFAULT_ASSEMBLY
2904            log.warning(f"Default assembly '{assembly}'")
2905
2906        # annotations databases folders
2907        annotations_databases = set(
2908            config.get("folders", {})
2909            .get("databases", {})
2910            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
2911            + config.get("folders", {})
2912            .get("databases", {})
2913            .get("parquet", ["~/howard/databases/parquet/current"])
2914            + config.get("folders", {})
2915            .get("databases", {})
2916            .get("bcftools", ["~/howard/databases/bcftools/current"])
2917        )
2918
2919        # Get param annotations
2920        if param.get("annotations", None) and isinstance(
2921            param.get("annotations", None), str
2922        ):
2923            log.debug(param.get("annotations", None))
2924            param_annotation_list = param.get("annotations").split(",")
2925        else:
2926            param_annotation_list = []
2927
2928        # Each tools param
2929        if param.get("annotation_parquet", None) != None:
2930            log.debug(
2931                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
2932            )
2933            if isinstance(param.get("annotation_parquet", None), list):
2934                param_annotation_list.append(",".join(param.get("annotation_parquet")))
2935            else:
2936                param_annotation_list.append(param.get("annotation_parquet"))
2937        if param.get("annotation_snpsift", None) != None:
2938            if isinstance(param.get("annotation_snpsift", None), list):
2939                param_annotation_list.append(
2940                    "snpsift:"
2941                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
2942                )
2943            else:
2944                param_annotation_list.append(
2945                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
2946                )
2947        if param.get("annotation_snpeff", None) != None:
2948            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
2949        if param.get("annotation_bcftools", None) != None:
2950            if isinstance(param.get("annotation_bcftools", None), list):
2951                param_annotation_list.append(
2952                    "bcftools:"
2953                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
2954                )
2955            else:
2956                param_annotation_list.append(
2957                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
2958                )
2959        if param.get("annotation_annovar", None) != None:
2960            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
2961        if param.get("annotation_exomiser", None) != None:
2962            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
2963        if param.get("annotation_splice", None) != None:
2964            param_annotation_list.append("splice:" + param.get("annotation_splice"))
2965
2966        # Merge param annotations list
2967        param["annotations"] = ",".join(param_annotation_list)
2968
2969        # debug
2970        log.debug(f"param_annotations={param['annotations']}")
2971
2972        if param.get("annotations"):
2973
2974            # Log
2975            # log.info("Annotations - Check annotation parameters")
2976
2977            if not "annotation" in param:
2978                param["annotation"] = {}
2979
2980            # List of annotations parameters
2981            annotations_list_input = {}
2982            if isinstance(param.get("annotations", None), str):
2983                annotation_file_list = [
2984                    value for value in param.get("annotations", "").split(",")
2985                ]
2986                for annotation_file in annotation_file_list:
2987                    annotations_list_input[annotation_file.strip()] = {"INFO": None}
2988            else:
2989                annotations_list_input = param.get("annotations", {})
2990
2991            log.info(f"Quick Annotations:")
2992            for annotation_key in list(annotations_list_input.keys()):
2993                log.info(f"   {annotation_key}")
2994
2995            # List of annotations and associated fields
2996            annotations_list = {}
2997
2998            for annotation_file in annotations_list_input:
2999
3000                # Explode annotations if ALL
3001                if (
3002                    annotation_file.upper() == "ALL"
3003                    or annotation_file.upper().startswith("ALL:")
3004                ):
3005
3006                    # check ALL parameters (formats, releases)
3007                    annotation_file_split = annotation_file.split(":")
3008                    database_formats = "parquet"
3009                    database_releases = "current"
3010                    for annotation_file_option in annotation_file_split[1:]:
3011                        database_all_options_split = annotation_file_option.split("=")
3012                        if database_all_options_split[0] == "format":
3013                            database_formats = database_all_options_split[1].split("+")
3014                        if database_all_options_split[0] == "release":
3015                            database_releases = database_all_options_split[1].split("+")
3016
3017                    # Scan for availabled databases
3018                    databases_infos_dict = self.scan_databases(
3019                        database_formats=database_formats,
3020                        database_releases=database_releases,
3021                    )
3022
3023                    # Add found databases in annotation parameters
3024                    for database_infos in databases_infos_dict.keys():
3025                        annotations_list[database_infos] = {"INFO": None}
3026
3027                else:
3028                    annotations_list[annotation_file] = annotations_list_input[
3029                        annotation_file
3030                    ]
3031
3032            # Check each databases
3033            if len(annotations_list):
3034
3035                log.info(
3036                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
3037                )
3038
3039                for annotation_file in annotations_list:
3040
3041                    # Init
3042                    annotations = annotations_list.get(annotation_file, None)
3043
3044                    # Annotation snpEff
3045                    if annotation_file.startswith("snpeff"):
3046
3047                        log.debug(f"Quick Annotation snpEff")
3048
3049                        if "snpeff" not in param["annotation"]:
3050                            param["annotation"]["snpeff"] = {}
3051
3052                        if "options" not in param["annotation"]["snpeff"]:
3053                            param["annotation"]["snpeff"]["options"] = ""
3054
3055                        # snpEff options in annotations
3056                        param["annotation"]["snpeff"]["options"] = "".join(
3057                            annotation_file.split(":")[1:]
3058                        )
3059
3060                    # Annotation Annovar
3061                    elif annotation_file.startswith("annovar"):
3062
3063                        log.debug(f"Quick Annotation Annovar")
3064
3065                        if "annovar" not in param["annotation"]:
3066                            param["annotation"]["annovar"] = {}
3067
3068                        if "annotations" not in param["annotation"]["annovar"]:
3069                            param["annotation"]["annovar"]["annotations"] = {}
3070
3071                        # Options
3072                        annotation_file_split = annotation_file.split(":")
3073                        for annotation_file_annotation in annotation_file_split[1:]:
3074                            if annotation_file_annotation:
3075                                param["annotation"]["annovar"]["annotations"][
3076                                    annotation_file_annotation
3077                                ] = annotations
3078
3079                    # Annotation Exomiser
3080                    elif annotation_file.startswith("exomiser"):
3081
3082                        log.debug(f"Quick Annotation Exomiser")
3083
3084                        param["annotation"]["exomiser"] = params_string_to_dict(
3085                            annotation_file
3086                        )
3087
3088                    # Annotation Splice
3089                    elif annotation_file.startswith("splice"):
3090
3091                        log.debug(f"Quick Annotation Splice")
3092
3093                        param["annotation"]["splice"] = params_string_to_dict(
3094                            annotation_file
3095                        )
3096
3097                    # Annotation Parquet or BCFTOOLS
3098                    else:
3099
3100                        # Tools detection
3101                        if annotation_file.startswith("bcftools:"):
3102                            annotation_tool_initial = "bcftools"
3103                            annotation_file = ":".join(annotation_file.split(":")[1:])
3104                        elif annotation_file.startswith("snpsift:"):
3105                            annotation_tool_initial = "snpsift"
3106                            annotation_file = ":".join(annotation_file.split(":")[1:])
3107                        elif annotation_file.startswith("bigwig:"):
3108                            annotation_tool_initial = "bigwig"
3109                            annotation_file = ":".join(annotation_file.split(":")[1:])
3110                        else:
3111                            annotation_tool_initial = None
3112
3113                        # list of files
3114                        annotation_file_list = annotation_file.replace("+", ":").split(
3115                            ":"
3116                        )
3117
3118                        for annotation_file in annotation_file_list:
3119
3120                            if annotation_file:
3121
3122                                # Annotation tool initial
3123                                annotation_tool = annotation_tool_initial
3124
3125                                # Find file
3126                                annotation_file_found = None
3127
3128                                if os.path.exists(annotation_file):
3129                                    annotation_file_found = annotation_file
3130                                elif os.path.exists(full_path(annotation_file)):
3131                                    annotation_file_found = full_path(annotation_file)
3132                                else:
3133                                    # Find within assembly folders
3134                                    for annotations_database in annotations_databases:
3135                                        found_files = find_all(
3136                                            annotation_file,
3137                                            os.path.join(
3138                                                annotations_database, assembly
3139                                            ),
3140                                        )
3141                                        if len(found_files) > 0:
3142                                            annotation_file_found = found_files[0]
3143                                            break
3144                                    if not annotation_file_found and not assembly:
3145                                        # Find within folders
3146                                        for (
3147                                            annotations_database
3148                                        ) in annotations_databases:
3149                                            found_files = find_all(
3150                                                annotation_file, annotations_database
3151                                            )
3152                                            if len(found_files) > 0:
3153                                                annotation_file_found = found_files[0]
3154                                                break
3155                                log.debug(
3156                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
3157                                )
3158
3159                                # Full path
3160                                annotation_file_found = full_path(annotation_file_found)
3161
3162                                if annotation_file_found:
3163
3164                                    database = Database(database=annotation_file_found)
3165                                    quick_annotation_format = database.get_format()
3166                                    quick_annotation_is_compressed = (
3167                                        database.is_compressed()
3168                                    )
3169                                    quick_annotation_is_indexed = os.path.exists(
3170                                        f"{annotation_file_found}.tbi"
3171                                    )
3172                                    bcftools_preference = False
3173
3174                                    # Check Annotation Tool
3175                                    if not annotation_tool:
3176                                        if (
3177                                            bcftools_preference
3178                                            and quick_annotation_format
3179                                            in ["vcf", "bed"]
3180                                            and quick_annotation_is_compressed
3181                                            and quick_annotation_is_indexed
3182                                        ):
3183                                            annotation_tool = "bcftools"
3184                                        elif quick_annotation_format in [
3185                                            "vcf",
3186                                            "bed",
3187                                            "tsv",
3188                                            "tsv",
3189                                            "csv",
3190                                            "json",
3191                                            "tbl",
3192                                            "parquet",
3193                                            "duckdb",
3194                                        ]:
3195                                            annotation_tool = "parquet"
3196                                        elif quick_annotation_format in ["bw"]:
3197                                            annotation_tool = "bigwig"
3198                                        else:
3199                                            log.error(
3200                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
3201                                            )
3202                                            raise ValueError(
3203                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
3204                                            )
3205
3206                                    log.debug(
3207                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
3208                                    )
3209
3210                                    # Annotation Tool dispatch
3211                                    if annotation_tool:
3212                                        if annotation_tool not in param["annotation"]:
3213                                            param["annotation"][annotation_tool] = {}
3214                                        if (
3215                                            "annotations"
3216                                            not in param["annotation"][annotation_tool]
3217                                        ):
3218                                            param["annotation"][annotation_tool][
3219                                                "annotations"
3220                                            ] = {}
3221                                        param["annotation"][annotation_tool][
3222                                            "annotations"
3223                                        ][annotation_file_found] = annotations
3224
3225                                else:
3226                                    log.warning(
3227                                        f"Quick Annotation File {annotation_file} does NOT exist"
3228                                    )
3229
3230                self.set_param(param)
3231
3232        if param.get("annotation", None):
3233            log.info("Annotations")
3234            if param.get("annotation", {}).get("parquet", None):
3235                log.info("Annotations 'parquet'...")
3236                self.annotation_parquet()
3237            if param.get("annotation", {}).get("bcftools", None):
3238                log.info("Annotations 'bcftools'...")
3239                self.annotation_bcftools()
3240            if param.get("annotation", {}).get("snpsift", None):
3241                log.info("Annotations 'snpsift'...")
3242                self.annotation_snpsift()
3243            if param.get("annotation", {}).get("bigwig", None):
3244                log.info("Annotations 'bigwig'...")
3245                self.annotation_bigwig()
3246            if param.get("annotation", {}).get("annovar", None):
3247                log.info("Annotations 'annovar'...")
3248                self.annotation_annovar()
3249            if param.get("annotation", {}).get("snpeff", None):
3250                log.info("Annotations 'snpeff'...")
3251                self.annotation_snpeff()
3252            if param.get("annotation", {}).get("exomiser", None) is not None:
3253                log.info("Annotations 'exomiser'...")
3254                self.annotation_exomiser()
3255            if param.get("annotation", {}).get("splice", None) is not None:
3256                log.info("Annotations 'splice' ...")
3257                self.annotation_splice()
3258
3259        # Explode INFOS fields into table fields
3260        if self.get_explode_infos():
3261            self.explode_infos(
3262                prefix=self.get_explode_infos_prefix(),
3263                fields=self.get_explode_infos_fields(),
3264                force=True,
3265            )

This method annotates the VCF file with the annotations specified in the configuration and parameter files.

def annotation_bigwig(self, threads: int = None) -> None:
3267    def annotation_bigwig(self, threads: int = None) -> None:
3268        """
3269        The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases.
3270
3271        :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the
3272        number of threads to be used for parallel processing during the annotation process. If the
3273        `threads` parameter is not provided, the method will attempt to determine the optimal number of
3274        threads to use based on the system configuration
3275        :type threads: int
3276        :return: True
3277        """
3278
3279        # DEBUG
3280        log.debug("Start annotation with bigwig databases")
3281
3282        # # Threads
3283        # if not threads:
3284        #     threads = self.get_threads()
3285        # log.debug("Threads: " + str(threads))
3286
3287        # Config
3288        config = self.get_config()
3289        log.debug("Config: " + str(config))
3290
3291        # Config - BCFTools databases folders
3292        databases_folders = set(
3293            self.get_config()
3294            .get("folders", {})
3295            .get("databases", {})
3296            .get("annotations", ["."])
3297            + self.get_config()
3298            .get("folders", {})
3299            .get("databases", {})
3300            .get("bigwig", ["."])
3301        )
3302        log.debug("Databases annotations: " + str(databases_folders))
3303
3304        # Param
3305        annotations = (
3306            self.get_param()
3307            .get("annotation", {})
3308            .get("bigwig", {})
3309            .get("annotations", None)
3310        )
3311        log.debug("Annotations: " + str(annotations))
3312
3313        # Assembly
3314        assembly = self.get_param().get(
3315            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3316        )
3317
3318        # Data
3319        table_variants = self.get_table_variants()
3320
3321        # Check if not empty
3322        log.debug("Check if not empty")
3323        sql_query_chromosomes = (
3324            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3325        )
3326        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3327        if not sql_query_chromosomes_df["count"][0]:
3328            log.info(f"VCF empty")
3329            return
3330
3331        # VCF header
3332        vcf_reader = self.get_header()
3333        log.debug("Initial header: " + str(vcf_reader.infos))
3334
3335        # Existing annotations
3336        for vcf_annotation in self.get_header().infos:
3337
3338            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3339            log.debug(
3340                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3341            )
3342
3343        if annotations:
3344
3345            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
3346
3347                # Export VCF file
3348                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
3349
3350                # annotation_bigwig_config
3351                annotation_bigwig_config_list = []
3352
3353                for annotation in annotations:
3354                    annotation_fields = annotations[annotation]
3355
3356                    # Annotation Name
3357                    annotation_name = os.path.basename(annotation)
3358
3359                    if not annotation_fields:
3360                        annotation_fields = {"INFO": None}
3361
3362                    log.debug(f"Annotation '{annotation_name}'")
3363                    log.debug(
3364                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3365                    )
3366
3367                    # Create Database
3368                    database = Database(
3369                        database=annotation,
3370                        databases_folders=databases_folders,
3371                        assembly=assembly,
3372                    )
3373
3374                    # Find files
3375                    db_file = database.get_database()
3376                    db_file = full_path(db_file)
3377                    db_hdr_file = database.get_header_file()
3378                    db_hdr_file = full_path(db_hdr_file)
3379                    db_file_type = database.get_format()
3380
3381                    # If db_file is http ?
3382                    if database.get_database().startswith("http"):
3383
3384                        # Datbase is HTTP URL
3385                        db_file_is_http = True
3386
3387                        # DB file keep as URL
3388                        db_file = database.get_database()
3389                        log.warning(
3390                            f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)"
3391                        )
3392
3393                        # Retrieve automatic annotation field name
3394                        annotation_field = clean_annotation_field(
3395                            os.path.basename(db_file).replace(".bw", "")
3396                        )
3397                        log.debug(
3398                            f"Create header file with annotation field '{annotation_field}' is an HTTP URL"
3399                        )
3400
3401                        # Create automatic header file
3402                        db_hdr_file = os.path.join(tmp_dir, "header.hdr")
3403                        with open(db_hdr_file, "w") as f:
3404                            f.write("##fileformat=VCFv4.2\n")
3405                            f.write(
3406                                f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n"""
3407                            )
3408                            f.write(f"#CHROM	START	END	{annotation_field}\n")
3409
3410                    else:
3411
3412                        # Datbase is NOT HTTP URL
3413                        db_file_is_http = False
3414
3415                    # Check index - try to create if not exists
3416                    if (
3417                        db_file is None
3418                        or db_hdr_file is None
3419                        or (not os.path.exists(db_file) and not db_file_is_http)
3420                        or not os.path.exists(db_hdr_file)
3421                        or not db_file_type in ["bw"]
3422                    ):
3423                        # if False:
3424                        log.error("Annotation failed: database not valid")
3425                        log.error(f"Annotation annotation file: {db_file}")
3426                        log.error(f"Annotation annotation file type: {db_file_type}")
3427                        log.error(f"Annotation annotation header: {db_hdr_file}")
3428                        raise ValueError(
3429                            f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}"
3430                        )
3431                    else:
3432
3433                        # Log
3434                        log.debug(
3435                            f"Annotation '{annotation}' - file: "
3436                            + str(db_file)
3437                            + " and "
3438                            + str(db_hdr_file)
3439                        )
3440
3441                        # Load header as VCF object
3442                        db_hdr_vcf = Variants(input=db_hdr_file)
3443                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3444                        log.debug(
3445                            "Annotation database header: "
3446                            + str(db_hdr_vcf_header_infos)
3447                        )
3448
3449                        # For all fields in database
3450                        annotation_fields_full = False
3451                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
3452                            annotation_fields = {
3453                                key: key for key in db_hdr_vcf_header_infos
3454                            }
3455                            log.debug(
3456                                "Annotation database header - All annotations added: "
3457                                + str(annotation_fields)
3458                            )
3459                            annotation_fields_full = True
3460
3461                        # Init
3462                        cyvcf2_header_rename_dict = {}
3463                        cyvcf2_header_list = []
3464                        cyvcf2_header_indexes = {}
3465
3466                        # process annotation fields
3467                        for annotation_field in annotation_fields:
3468
3469                            # New annotation name
3470                            annotation_field_new = annotation_fields[annotation_field]
3471
3472                            # Check annotation field and index in header
3473                            if (
3474                                annotation_field
3475                                in db_hdr_vcf.get_header_columns_as_list()
3476                            ):
3477                                annotation_field_index = (
3478                                    db_hdr_vcf.get_header_columns_as_list().index(
3479                                        annotation_field
3480                                    )
3481                                    - 3
3482                                )
3483                                cyvcf2_header_indexes[annotation_field_new] = (
3484                                    annotation_field_index
3485                                )
3486                            else:
3487                                msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'"
3488                                log.error(msg_err)
3489                                raise ValueError(msg_err)
3490
3491                            # Append annotation field in cyvcf2 header list
3492                            cyvcf2_header_rename_dict[annotation_field_new] = (
3493                                db_hdr_vcf_header_infos[annotation_field].id
3494                            )
3495                            cyvcf2_header_list.append(
3496                                {
3497                                    "ID": annotation_field_new,
3498                                    "Number": db_hdr_vcf_header_infos[
3499                                        annotation_field
3500                                    ].num,
3501                                    "Type": db_hdr_vcf_header_infos[
3502                                        annotation_field
3503                                    ].type,
3504                                    "Description": db_hdr_vcf_header_infos[
3505                                        annotation_field
3506                                    ].desc,
3507                                }
3508                            )
3509
3510                            # Add header on VCF
3511                            vcf_reader.infos[annotation_field_new] = vcf.parser._Info(
3512                                annotation_field_new,
3513                                db_hdr_vcf_header_infos[annotation_field].num,
3514                                db_hdr_vcf_header_infos[annotation_field].type,
3515                                db_hdr_vcf_header_infos[annotation_field].desc,
3516                                "HOWARD BigWig annotation",
3517                                "unknown",
3518                                self.code_type_map[
3519                                    db_hdr_vcf_header_infos[annotation_field].type
3520                                ],
3521                            )
3522
3523                        # Load bigwig database
3524                        bw_db = pyBigWig.open(db_file)
3525                        if bw_db.isBigWig():
3526                            log.debug(f"Database '{db_file}' is in 'BigWig' format")
3527                        else:
3528                            msg_err = f"Database '{db_file}' is NOT in 'BigWig' format"
3529                            log.error(msg_err)
3530                            raise ValueError(msg_err)
3531
3532                        annotation_bigwig_config_list.append(
3533                            {
3534                                "db_file": db_file,
3535                                "bw_db": bw_db,
3536                                "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict,
3537                                "cyvcf2_header_list": cyvcf2_header_list,
3538                                "cyvcf2_header_indexes": cyvcf2_header_indexes,
3539                            }
3540                        )
3541
3542                # Annotate
3543                if annotation_bigwig_config_list:
3544
3545                    # Annotation config
3546                    log.debug(
3547                        f"annotation_bigwig_config={annotation_bigwig_config_list}"
3548                    )
3549
3550                    # Export VCF file
3551                    self.export_variant_vcf(
3552                        vcf_file=tmp_vcf_name,
3553                        remove_info=True,
3554                        add_samples=False,
3555                        index=True,
3556                    )
3557
3558                    # Load input tmp file
3559                    input_vcf = cyvcf2.VCF(tmp_vcf_name)
3560
3561                    # Add header in input file
3562                    for annotation_bigwig_config in annotation_bigwig_config_list:
3563                        for cyvcf2_header_field in annotation_bigwig_config.get(
3564                            "cyvcf2_header_list", []
3565                        ):
3566                            log.info(
3567                                f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'"
3568                            )
3569                            input_vcf.add_info_to_header(cyvcf2_header_field)
3570
3571                    # Create output VCF file
3572                    output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz")
3573                    output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf)
3574
3575                    # Fetch variants
3576                    log.info(f"Annotations 'bigwig' start...")
3577                    for variant in input_vcf:
3578
3579                        for annotation_bigwig_config in annotation_bigwig_config_list:
3580
3581                            # DB and indexes
3582                            bw_db = annotation_bigwig_config.get("bw_db", None)
3583                            cyvcf2_header_indexes = annotation_bigwig_config.get(
3584                                "cyvcf2_header_indexes", None
3585                            )
3586
3587                            # Retrieve value from chrom pos
3588                            res = bw_db.values(
3589                                variant.CHROM, variant.POS - 1, variant.POS
3590                            )
3591
3592                            # For each annotation fields (and indexes)
3593                            for cyvcf2_header_index in cyvcf2_header_indexes:
3594
3595                                # If value is NOT nNone
3596                                if not np.isnan(
3597                                    res[cyvcf2_header_indexes[cyvcf2_header_index]]
3598                                ):
3599                                    variant.INFO[cyvcf2_header_index] = res[
3600                                        cyvcf2_header_indexes[cyvcf2_header_index]
3601                                    ]
3602
3603                        # Add record in output file
3604                        output_vcf.write_record(variant)
3605
3606                    # Log
3607                    log.debug(f"Annotation done.")
3608
3609                    # Close and write file
3610                    log.info(f"Annotations 'bigwig' write...")
3611                    output_vcf.close()
3612                    log.debug(f"Write done.")
3613
3614                    # Update variants
3615                    log.info(f"Annotations 'bigwig' update...")
3616                    self.update_from_vcf(output_vcf_file)
3617                    log.debug(f"Update done.")
3618
3619        return True

The function annotation_bigwig annotates variants in a VCF file using bigwig databases.

Parameters
  • threads: The threads parameter in the annotation_bigwig method is used to specify the number of threads to be used for parallel processing during the annotation process. If the threads parameter is not provided, the method will attempt to determine the optimal number of threads to use based on the system configuration
Returns

True when the annotation step completes; None when the variants table is empty.

def annotation_snpsift(self, threads: int = None) -> None:
3621    def annotation_snpsift(self, threads: int = None) -> None:
3622        """
3623        This function annotate with bcftools
3624
3625        :param threads: Number of threads to use
3626        :return: the value of the variable "return_value".
3627        """
3628
3629        # DEBUG
3630        log.debug("Start annotation with bcftools databases")
3631
3632        # Threads
3633        if not threads:
3634            threads = self.get_threads()
3635        log.debug("Threads: " + str(threads))
3636
3637        # Config
3638        config = self.get_config()
3639        log.debug("Config: " + str(config))
3640
3641        # Config - snpSift
3642        snpsift_bin_command = get_bin_command(
3643            bin="SnpSift.jar",
3644            tool="snpsift",
3645            bin_type="jar",
3646            config=config,
3647            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
3648        )
3649        if not snpsift_bin_command:
3650            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
3651            log.error(msg_err)
3652            raise ValueError(msg_err)
3653
3654        # Config - bcftools
3655        bcftools_bin_command = get_bin_command(
3656            bin="bcftools",
3657            tool="bcftools",
3658            bin_type="bin",
3659            config=config,
3660            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3661        )
3662        if not bcftools_bin_command:
3663            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3664            log.error(msg_err)
3665            raise ValueError(msg_err)
3666
3667        # Config - BCFTools databases folders
3668        databases_folders = set(
3669            self.get_config()
3670            .get("folders", {})
3671            .get("databases", {})
3672            .get("annotations", ["."])
3673            + self.get_config()
3674            .get("folders", {})
3675            .get("databases", {})
3676            .get("bcftools", ["."])
3677        )
3678        log.debug("Databases annotations: " + str(databases_folders))
3679
3680        # Param
3681        annotations = (
3682            self.get_param()
3683            .get("annotation", {})
3684            .get("snpsift", {})
3685            .get("annotations", None)
3686        )
3687        log.debug("Annotations: " + str(annotations))
3688
3689        # Assembly
3690        assembly = self.get_param().get(
3691            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3692        )
3693
3694        # Data
3695        table_variants = self.get_table_variants()
3696
3697        # Check if not empty
3698        log.debug("Check if not empty")
3699        sql_query_chromosomes = (
3700            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3701        )
3702        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3703        if not sql_query_chromosomes_df["count"][0]:
3704            log.info(f"VCF empty")
3705            return
3706
3707        # VCF header
3708        vcf_reader = self.get_header()
3709        log.debug("Initial header: " + str(vcf_reader.infos))
3710
3711        # Existing annotations
3712        for vcf_annotation in self.get_header().infos:
3713
3714            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3715            log.debug(
3716                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3717            )
3718
3719        if annotations:
3720
3721            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
3722
3723                # Export VCF file
3724                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
3725
3726                # Init
3727                commands = {}
3728
3729                for annotation in annotations:
3730                    annotation_fields = annotations[annotation]
3731
3732                    # Annotation Name
3733                    annotation_name = os.path.basename(annotation)
3734
3735                    if not annotation_fields:
3736                        annotation_fields = {"INFO": None}
3737
3738                    log.debug(f"Annotation '{annotation_name}'")
3739                    log.debug(
3740                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3741                    )
3742
3743                    # Create Database
3744                    database = Database(
3745                        database=annotation,
3746                        databases_folders=databases_folders,
3747                        assembly=assembly,
3748                    )
3749
3750                    # Find files
3751                    db_file = database.get_database()
3752                    db_file = full_path(db_file)
3753                    db_hdr_file = database.get_header_file()
3754                    db_hdr_file = full_path(db_hdr_file)
3755                    db_file_type = database.get_format()
3756                    db_tbi_file = f"{db_file}.tbi"
3757                    db_file_compressed = database.is_compressed()
3758
3759                    # Check if compressed
3760                    if not db_file_compressed:
3761                        log.error(
3762                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3763                        )
3764                        raise ValueError(
3765                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3766                        )
3767
3768                    # Check if indexed
3769                    if not os.path.exists(db_tbi_file):
3770                        log.error(
3771                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3772                        )
3773                        raise ValueError(
3774                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3775                        )
3776
3777                    # Check index - try to create if not exists
3778                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
3779                        log.error("Annotation failed: database not valid")
3780                        log.error(f"Annotation annotation file: {db_file}")
3781                        log.error(f"Annotation annotation header: {db_hdr_file}")
3782                        log.error(f"Annotation annotation index: {db_tbi_file}")
3783                        raise ValueError(
3784                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
3785                        )
3786                    else:
3787
3788                        log.debug(
3789                            f"Annotation '{annotation}' - file: "
3790                            + str(db_file)
3791                            + " and "
3792                            + str(db_hdr_file)
3793                        )
3794
3795                        # Load header as VCF object
3796                        db_hdr_vcf = Variants(input=db_hdr_file)
3797                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3798                        log.debug(
3799                            "Annotation database header: "
3800                            + str(db_hdr_vcf_header_infos)
3801                        )
3802
3803                        # For all fields in database
3804                        annotation_fields_full = False
3805                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
3806                            annotation_fields = {
3807                                key: key for key in db_hdr_vcf_header_infos
3808                            }
3809                            log.debug(
3810                                "Annotation database header - All annotations added: "
3811                                + str(annotation_fields)
3812                            )
3813                            annotation_fields_full = True
3814
3815                        # # Create file for field rename
3816                        # log.debug("Create file for field rename")
3817                        # tmp_rename = NamedTemporaryFile(
3818                        #     prefix=self.get_prefix(),
3819                        #     dir=self.get_tmp_dir(),
3820                        #     suffix=".rename",
3821                        #     delete=False,
3822                        # )
3823                        # tmp_rename_name = tmp_rename.name
3824                        # tmp_files.append(tmp_rename_name)
3825
3826                        # Number of fields
3827                        nb_annotation_field = 0
3828                        annotation_list = []
3829                        annotation_infos_rename_list = []
3830
3831                        for annotation_field in annotation_fields:
3832
3833                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
3834                            annotation_fields_new_name = annotation_fields.get(
3835                                annotation_field, annotation_field
3836                            )
3837                            if not annotation_fields_new_name:
3838                                annotation_fields_new_name = annotation_field
3839
3840                            # Check if field is in DB and if field is not elready in input data
3841                            if (
3842                                annotation_field in db_hdr_vcf.get_header().infos
3843                                and annotation_fields_new_name
3844                                not in self.get_header().infos
3845                            ):
3846
3847                                log.info(
3848                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
3849                                )
3850
3851                                # BCFTools annotate param to rename fields
3852                                if annotation_field != annotation_fields_new_name:
3853                                    annotation_infos_rename_list.append(
3854                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
3855                                    )
3856
3857                                # Add INFO field to header
3858                                db_hdr_vcf_header_infos_number = (
3859                                    db_hdr_vcf_header_infos[annotation_field].num or "."
3860                                )
3861                                db_hdr_vcf_header_infos_type = (
3862                                    db_hdr_vcf_header_infos[annotation_field].type
3863                                    or "String"
3864                                )
3865                                db_hdr_vcf_header_infos_description = (
3866                                    db_hdr_vcf_header_infos[annotation_field].desc
3867                                    or f"{annotation_field} description"
3868                                )
3869                                db_hdr_vcf_header_infos_source = (
3870                                    db_hdr_vcf_header_infos[annotation_field].source
3871                                    or "unknown"
3872                                )
3873                                db_hdr_vcf_header_infos_version = (
3874                                    db_hdr_vcf_header_infos[annotation_field].version
3875                                    or "unknown"
3876                                )
3877
3878                                vcf_reader.infos[annotation_fields_new_name] = (
3879                                    vcf.parser._Info(
3880                                        annotation_fields_new_name,
3881                                        db_hdr_vcf_header_infos_number,
3882                                        db_hdr_vcf_header_infos_type,
3883                                        db_hdr_vcf_header_infos_description,
3884                                        db_hdr_vcf_header_infos_source,
3885                                        db_hdr_vcf_header_infos_version,
3886                                        self.code_type_map[
3887                                            db_hdr_vcf_header_infos_type
3888                                        ],
3889                                    )
3890                                )
3891
3892                                annotation_list.append(annotation_field)
3893
3894                                nb_annotation_field += 1
3895
3896                            else:
3897
3898                                if (
3899                                    annotation_field
3900                                    not in db_hdr_vcf.get_header().infos
3901                                ):
3902                                    log.warning(
3903                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
3904                                    )
3905                                if (
3906                                    annotation_fields_new_name
3907                                    in self.get_header().infos
3908                                ):
3909                                    log.warning(
3910                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
3911                                    )
3912
3913                        log.info(
3914                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
3915                        )
3916
3917                        annotation_infos = ",".join(annotation_list)
3918
3919                        if annotation_infos != "":
3920
3921                            # Annotated VCF (and error file)
3922                            tmp_annotation_vcf_name = os.path.join(
3923                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
3924                            )
3925                            tmp_annotation_vcf_name_err = (
3926                                tmp_annotation_vcf_name + ".err"
3927                            )
3928
3929                            # Add fields to annotate
3930                            if not annotation_fields_full:
3931                                annotation_infos_option = f"-info {annotation_infos}"
3932                            else:
3933                                annotation_infos_option = ""
3934
3935                            # Info fields rename
3936                            if annotation_infos_rename_list:
3937                                annotation_infos_rename = " -c " + ",".join(
3938                                    annotation_infos_rename_list
3939                                )
3940                            else:
3941                                annotation_infos_rename = ""
3942
3943                            # Annotate command
3944                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
3945
3946                            # Add command
3947                            commands[command_annotate] = tmp_annotation_vcf_name
3948
3949                if commands:
3950
3951                    # Export VCF file
3952                    self.export_variant_vcf(
3953                        vcf_file=tmp_vcf_name,
3954                        remove_info=True,
3955                        add_samples=False,
3956                        index=True,
3957                    )
3958                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
3959
3960                    # Num command
3961                    nb_command = 0
3962
3963                    # Annotate
3964                    for command_annotate in commands:
3965                        nb_command += 1
3966                        log.info(
3967                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
3968                        )
3969                        log.debug(f"command_annotate={command_annotate}")
3970                        run_parallel_commands([command_annotate], threads)
3971
3972                        # Debug
3973                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
3974
3975                        # Update variants
3976                        log.info(
3977                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
3978                        )
3979                        self.update_from_vcf(commands[command_annotate])

This function annotate with bcftools

Parameters
  • threads: Number of threads to use
Returns

the value of the variable "return_value".

def annotation_bcftools(self, threads: int = None) -> None:
3981    def annotation_bcftools(self, threads: int = None) -> None:
3982        """
3983        This function annotate with bcftools
3984
3985        :param threads: Number of threads to use
3986        :return: the value of the variable "return_value".
3987        """
3988
3989        # DEBUG
3990        log.debug("Start annotation with bcftools databases")
3991
3992        # Threads
3993        if not threads:
3994            threads = self.get_threads()
3995        log.debug("Threads: " + str(threads))
3996
3997        # Config
3998        config = self.get_config()
3999        log.debug("Config: " + str(config))
4000
4001        # DEBUG
4002        delete_tmp = True
4003        if self.get_config().get("verbosity", "warning") in ["debug"]:
4004            delete_tmp = False
4005            log.debug("Delete tmp files/folders: " + str(delete_tmp))
4006
4007        # Config - BCFTools bin command
4008        bcftools_bin_command = get_bin_command(
4009            bin="bcftools",
4010            tool="bcftools",
4011            bin_type="bin",
4012            config=config,
4013            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
4014        )
4015        if not bcftools_bin_command:
4016            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
4017            log.error(msg_err)
4018            raise ValueError(msg_err)
4019
4020        # Config - BCFTools databases folders
4021        databases_folders = set(
4022            self.get_config()
4023            .get("folders", {})
4024            .get("databases", {})
4025            .get("annotations", ["."])
4026            + self.get_config()
4027            .get("folders", {})
4028            .get("databases", {})
4029            .get("bcftools", ["."])
4030        )
4031        log.debug("Databases annotations: " + str(databases_folders))
4032
4033        # Param
4034        annotations = (
4035            self.get_param()
4036            .get("annotation", {})
4037            .get("bcftools", {})
4038            .get("annotations", None)
4039        )
4040        log.debug("Annotations: " + str(annotations))
4041
4042        # Assembly
4043        assembly = self.get_param().get(
4044            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
4045        )
4046
4047        # Data
4048        table_variants = self.get_table_variants()
4049
4050        # Check if not empty
4051        log.debug("Check if not empty")
4052        sql_query_chromosomes = (
4053            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4054        )
4055        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
4056        if not sql_query_chromosomes_df["count"][0]:
4057            log.info(f"VCF empty")
4058            return
4059
4060        # Export in VCF
4061        log.debug("Create initial file to annotate")
4062        tmp_vcf = NamedTemporaryFile(
4063            prefix=self.get_prefix(),
4064            dir=self.get_tmp_dir(),
4065            suffix=".vcf.gz",
4066            delete=False,
4067        )
4068        tmp_vcf_name = tmp_vcf.name
4069
4070        # VCF header
4071        vcf_reader = self.get_header()
4072        log.debug("Initial header: " + str(vcf_reader.infos))
4073
4074        # Existing annotations
4075        for vcf_annotation in self.get_header().infos:
4076
4077            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
4078            log.debug(
4079                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
4080            )
4081
4082        if annotations:
4083
4084            tmp_ann_vcf_list = []
4085            commands = []
4086            tmp_files = []
4087            err_files = []
4088
4089            for annotation in annotations:
4090                annotation_fields = annotations[annotation]
4091
4092                # Annotation Name
4093                annotation_name = os.path.basename(annotation)
4094
4095                if not annotation_fields:
4096                    annotation_fields = {"INFO": None}
4097
4098                log.debug(f"Annotation '{annotation_name}'")
4099                log.debug(
4100                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
4101                )
4102
4103                # Create Database
4104                database = Database(
4105                    database=annotation,
4106                    databases_folders=databases_folders,
4107                    assembly=assembly,
4108                )
4109
4110                # Find files
4111                db_file = database.get_database()
4112                db_file = full_path(db_file)
4113                db_hdr_file = database.get_header_file()
4114                db_hdr_file = full_path(db_hdr_file)
4115                db_file_type = database.get_format()
4116                db_tbi_file = f"{db_file}.tbi"
4117                db_file_compressed = database.is_compressed()
4118
4119                # Check if compressed
4120                if not db_file_compressed:
4121                    log.error(
4122                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
4123                    )
4124                    raise ValueError(
4125                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
4126                    )
4127
4128                # Check if indexed
4129                if not os.path.exists(db_tbi_file):
4130                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
4131                    raise ValueError(
4132                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
4133                    )
4134
4135                # Check index - try to create if not exists
4136                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
4137                    log.error("Annotation failed: database not valid")
4138                    log.error(f"Annotation annotation file: {db_file}")
4139                    log.error(f"Annotation annotation header: {db_hdr_file}")
4140                    log.error(f"Annotation annotation index: {db_tbi_file}")
4141                    raise ValueError(
4142                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
4143                    )
4144                else:
4145
4146                    log.debug(
4147                        f"Annotation '{annotation}' - file: "
4148                        + str(db_file)
4149                        + " and "
4150                        + str(db_hdr_file)
4151                    )
4152
4153                    # Load header as VCF object
4154                    db_hdr_vcf = Variants(input=db_hdr_file)
4155                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
4156                    log.debug(
4157                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
4158                    )
4159
4160                    # For all fields in database
4161                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
4162                        annotation_fields = {
4163                            key: key for key in db_hdr_vcf_header_infos
4164                        }
4165                        log.debug(
4166                            "Annotation database header - All annotations added: "
4167                            + str(annotation_fields)
4168                        )
4169
4170                    # Number of fields
4171                    nb_annotation_field = 0
4172                    annotation_list = []
4173
4174                    for annotation_field in annotation_fields:
4175
4176                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
4177                        annotation_fields_new_name = annotation_fields.get(
4178                            annotation_field, annotation_field
4179                        )
4180                        if not annotation_fields_new_name:
4181                            annotation_fields_new_name = annotation_field
4182
4183                        # Check if field is in DB and if field is not elready in input data
4184                        if (
4185                            annotation_field in db_hdr_vcf.get_header().infos
4186                            and annotation_fields_new_name
4187                            not in self.get_header().infos
4188                        ):
4189
4190                            log.info(
4191                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
4192                            )
4193
4194                            # Add INFO field to header
4195                            db_hdr_vcf_header_infos_number = (
4196                                db_hdr_vcf_header_infos[annotation_field].num or "."
4197                            )
4198                            db_hdr_vcf_header_infos_type = (
4199                                db_hdr_vcf_header_infos[annotation_field].type
4200                                or "String"
4201                            )
4202                            db_hdr_vcf_header_infos_description = (
4203                                db_hdr_vcf_header_infos[annotation_field].desc
4204                                or f"{annotation_field} description"
4205                            )
4206                            db_hdr_vcf_header_infos_source = (
4207                                db_hdr_vcf_header_infos[annotation_field].source
4208                                or "unknown"
4209                            )
4210                            db_hdr_vcf_header_infos_version = (
4211                                db_hdr_vcf_header_infos[annotation_field].version
4212                                or "unknown"
4213                            )
4214
4215                            vcf_reader.infos[annotation_fields_new_name] = (
4216                                vcf.parser._Info(
4217                                    annotation_fields_new_name,
4218                                    db_hdr_vcf_header_infos_number,
4219                                    db_hdr_vcf_header_infos_type,
4220                                    db_hdr_vcf_header_infos_description,
4221                                    db_hdr_vcf_header_infos_source,
4222                                    db_hdr_vcf_header_infos_version,
4223                                    self.code_type_map[db_hdr_vcf_header_infos_type],
4224                                )
4225                            )
4226
4227                            # annotation_list.append(annotation_field)
4228                            if annotation_field != annotation_fields_new_name:
4229                                annotation_list.append(
4230                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
4231                                )
4232                            else:
4233                                annotation_list.append(annotation_field)
4234
4235                            nb_annotation_field += 1
4236
4237                        else:
4238
4239                            if annotation_field not in db_hdr_vcf.get_header().infos:
4240                                log.warning(
4241                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
4242                                )
4243                            if annotation_fields_new_name in self.get_header().infos:
4244                                log.warning(
4245                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
4246                                )
4247
4248                    log.info(
4249                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
4250                    )
4251
4252                    annotation_infos = ",".join(annotation_list)
4253
4254                    if annotation_infos != "":
4255
4256                        # Protect header for bcftools (remove "#CHROM" and variants line)
4257                        log.debug("Protect Header file - remove #CHROM line if exists")
4258                        tmp_header_vcf = NamedTemporaryFile(
4259                            prefix=self.get_prefix(),
4260                            dir=self.get_tmp_dir(),
4261                            suffix=".hdr",
4262                            delete=False,
4263                        )
4264                        tmp_header_vcf_name = tmp_header_vcf.name
4265                        tmp_files.append(tmp_header_vcf_name)
4266                        # Command
4267                        if db_hdr_file.endswith(".gz"):
4268                            command_extract_header = f"zcat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
4269                        else:
4270                            command_extract_header = f"cat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
4271                        # Run
4272                        run_parallel_commands([command_extract_header], 1)
4273
4274                        # Find chomosomes
4275                        log.debug("Find chromosomes ")
4276                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
4277                        sql_query_chromosomes_df = self.get_query_to_df(
4278                            sql_query_chromosomes
4279                        )
4280                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
4281
4282                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
4283
4284                        # BED columns in the annotation file
4285                        if db_file_type in ["bed"]:
4286                            annotation_infos = "CHROM,POS,POS," + annotation_infos
4287
4288                        for chrom in chomosomes_list:
4289
4290                            # Create BED on initial VCF
4291                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
4292                            tmp_bed = NamedTemporaryFile(
4293                                prefix=self.get_prefix(),
4294                                dir=self.get_tmp_dir(),
4295                                suffix=".bed",
4296                                delete=False,
4297                            )
4298                            tmp_bed_name = tmp_bed.name
4299                            tmp_files.append(tmp_bed_name)
4300
4301                            # Detecte regions
4302                            log.debug(
4303                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
4304                            )
4305                            window = 1000000
4306                            sql_query_intervals_for_bed = f"""
4307                                SELECT  \"#CHROM\",
4308                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
4309                                        \"POS\"+{window}
4310                                FROM {table_variants} as table_variants
4311                                WHERE table_variants.\"#CHROM\" = '{chrom}'
4312                            """
4313                            regions = self.conn.execute(
4314                                sql_query_intervals_for_bed
4315                            ).fetchall()
4316                            merged_regions = merge_regions(regions)
4317                            log.debug(
4318                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
4319                            )
4320
4321                            header = ["#CHROM", "START", "END"]
4322                            with open(tmp_bed_name, "w") as f:
4323                                # Write the header with tab delimiter
4324                                f.write("\t".join(header) + "\n")
4325                                for d in merged_regions:
4326                                    # Write each data row with tab delimiter
4327                                    f.write("\t".join(map(str, d)) + "\n")
4328
4329                            # Tmp files
4330                            tmp_annotation_vcf = NamedTemporaryFile(
4331                                prefix=self.get_prefix(),
4332                                dir=self.get_tmp_dir(),
4333                                suffix=".vcf.gz",
4334                                delete=False,
4335                            )
4336                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
4337                            tmp_files.append(tmp_annotation_vcf_name)
4338                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
4339                            tmp_annotation_vcf_name_err = (
4340                                tmp_annotation_vcf_name + ".err"
4341                            )
4342                            err_files.append(tmp_annotation_vcf_name_err)
4343
4344                            # Annotate Command
4345                            log.debug(
4346                                f"Annotation '{annotation}' - add bcftools command"
4347                            )
4348
4349                            # Command
4350                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
4351
4352                            # Add command
4353                            commands.append(command_annotate)
4354
4355            # if some commands
4356            if commands:
4357
4358                # Export VCF file
4359                self.export_variant_vcf(
4360                    vcf_file=tmp_vcf_name,
4361                    remove_info=True,
4362                    add_samples=False,
4363                    index=True,
4364                )
4365
4366                # Threads
4367                # calculate threads for annotated commands
4368                if commands:
4369                    threads_bcftools_annotate = round(threads / len(commands))
4370                else:
4371                    threads_bcftools_annotate = 1
4372
4373                if not threads_bcftools_annotate:
4374                    threads_bcftools_annotate = 1
4375
4376                # Add threads option to bcftools commands
4377                if threads_bcftools_annotate > 1:
4378                    commands_threaded = []
4379                    for command in commands:
4380                        commands_threaded.append(
4381                            command.replace(
4382                                f"{bcftools_bin_command} annotate ",
4383                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
4384                            )
4385                        )
4386                    commands = commands_threaded
4387
4388                # Command annotation multithreading
4389                log.debug(f"Annotation - Annotation commands: " + str(commands))
4390                log.info(
4391                    f"Annotation - Annotation multithreaded in "
4392                    + str(len(commands))
4393                    + " commands"
4394                )
4395
4396                run_parallel_commands(commands, threads)
4397
4398                # Merge
4399                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
4400
4401                if tmp_ann_vcf_list_cmd:
4402
4403                    # Tmp file
4404                    tmp_annotate_vcf = NamedTemporaryFile(
4405                        prefix=self.get_prefix(),
4406                        dir=self.get_tmp_dir(),
4407                        suffix=".vcf.gz",
4408                        delete=True,
4409                    )
4410                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
4411                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
4412                    err_files.append(tmp_annotate_vcf_name_err)
4413
4414                    # Tmp file remove command
4415                    tmp_files_remove_command = ""
4416                    if tmp_files:
4417                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
4418
4419                    # Command merge
4420                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
4421                    log.info(
4422                        f"Annotation - Annotation merging "
4423                        + str(len(commands))
4424                        + " annotated files"
4425                    )
4426                    log.debug(f"Annotation - merge command: {merge_command}")
4427                    run_parallel_commands([merge_command], 1)
4428
4429                    # Error messages
4430                    log.info(f"Error/Warning messages:")
4431                    error_message_command_all = []
4432                    error_message_command_warning = []
4433                    error_message_command_err = []
4434                    for err_file in err_files:
4435                        with open(err_file, "r") as f:
4436                            for line in f:
4437                                message = line.strip()
4438                                error_message_command_all.append(message)
4439                                if line.startswith("[W::"):
4440                                    error_message_command_warning.append(message)
4441                                if line.startswith("[E::"):
4442                                    error_message_command_err.append(
4443                                        f"{err_file}: " + message
4444                                    )
4445                    # log info
4446                    for message in list(
4447                        set(error_message_command_err + error_message_command_warning)
4448                    ):
4449                        log.info(f"   {message}")
4450                    # debug info
4451                    for message in list(set(error_message_command_all)):
4452                        log.debug(f"   {message}")
4453                    # failed
4454                    if len(error_message_command_err):
4455                        log.error("Annotation failed: Error in commands")
4456                        raise ValueError("Annotation failed: Error in commands")
4457
4458                    # Update variants
4459                    log.info(f"Annotation - Updating...")
4460                    self.update_from_vcf(tmp_annotate_vcf_name)

This function annotate with bcftools

Parameters
  • threads: Number of threads to use
Returns

the value of the variable "return_value".

def annotation_exomiser(self, threads: int = None) -> None:
4462    def annotation_exomiser(self, threads: int = None) -> None:
4463        """
4464        This function annotate with Exomiser
4465
4466        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
4467        - "analysis" (dict/file):
4468            Full analysis dictionnary parameters (see Exomiser docs).
4469            Either a dict, or a file in JSON or YAML format.
4470            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
4471            Default : None
4472        - "preset" (string):
4473            Analysis preset (available in config folder).
4474            Used if no full "analysis" is provided.
4475            Default: "exome"
4476        - "phenopacket" (dict/file):
4477            Samples and phenotipic features parameters (see Exomiser docs).
4478            Either a dict, or a file in JSON or YAML format.
4479            Default: None
4480        - "subject" (dict):
4481            Sample parameters (see Exomiser docs).
4482            Example:
4483                "subject":
4484                    {
4485                        "id": "ISDBM322017",
4486                        "sex": "FEMALE"
4487                    }
4488            Default: None
4489        - "sample" (string):
4490            Sample name to construct "subject" section:
4491                "subject":
4492                    {
4493                        "id": "<sample>",
4494                        "sex": "UNKNOWN_SEX"
4495                    }
4496            Default: None
4497        - "phenotypicFeatures" (dict)
4498            Phenotypic features to construct "subject" section.
4499            Example:
4500                "phenotypicFeatures":
4501                    [
4502                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
4503                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
4504                    ]
4505        - "hpo" (list)
4506            List of HPO ids as phenotypic features.
4507            Example:
4508                "hpo": ['0001156', '0001363', '0011304', '0010055']
4509            Default: []
4510        - "outputOptions" (dict):
4511            Output options (see Exomiser docs).
4512            Default:
4513                "output_options" =
4514                    {
4515                        "outputContributingVariantsOnly": False,
4516                        "numGenes": 0,
4517                        "outputFormats": ["TSV_VARIANT", "VCF"]
4518                    }
4519        - "transcript_source" (string):
4520            Transcript source (either "refseq", "ucsc", "ensembl")
4521            Default: "refseq"
4522        - "exomiser_to_info" (boolean):
4523            Add exomiser TSV file columns as INFO fields in VCF.
4524            Default: False
4525        - "release" (string):
4526            Exomise database release.
4527            If not exists, database release will be downloaded (take a while).
4528            Default: None (provided by application.properties configuration file)
4529        - "exomiser_application_properties" (file):
4530            Exomiser configuration file (see Exomiser docs).
4531            Useful to automatically download databases (especially for specific genome databases).
4532
4533        Notes:
4534        - If no sample in parameters, first sample in VCF will be chosen
4535        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
4536
4537        :param threads: The number of threads to use
4538        :return: None.
4539        """
4540
4541        # DEBUG
4542        log.debug("Start annotation with Exomiser databases")
4543
4544        # Threads
4545        if not threads:
4546            threads = self.get_threads()
4547        log.debug("Threads: " + str(threads))
4548
4549        # Config
4550        config = self.get_config()
4551        log.debug("Config: " + str(config))
4552
4553        # Config - Folders - Databases
4554        databases_folders = (
4555            config.get("folders", {})
4556            .get("databases", {})
4557            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
4558        )
4559        databases_folders = full_path(databases_folders)
4560        if not os.path.exists(databases_folders):
4561            log.error(f"Databases annotations: {databases_folders} NOT found")
4562        log.debug("Databases annotations: " + str(databases_folders))
4563
4564        # Config - Exomiser
4565        exomiser_bin_command = get_bin_command(
4566            bin="exomiser-cli*.jar",
4567            tool="exomiser",
4568            bin_type="jar",
4569            config=config,
4570            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
4571        )
4572        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
4573        if not exomiser_bin_command:
4574            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
4575            log.error(msg_err)
4576            raise ValueError(msg_err)
4577
4578        # Param
4579        param = self.get_param()
4580        log.debug("Param: " + str(param))
4581
4582        # Param - Exomiser
4583        param_exomiser = param.get("annotation", {}).get("exomiser", {})
4584        log.debug(f"Param Exomiser: {param_exomiser}")
4585
4586        # Param - Assembly
4587        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
4588        log.debug("Assembly: " + str(assembly))
4589
4590        # Data
4591        table_variants = self.get_table_variants()
4592
4593        # Check if not empty
4594        log.debug("Check if not empty")
4595        sql_query_chromosomes = (
4596            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4597        )
4598        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
4599            log.info(f"VCF empty")
4600            return False
4601
4602        # VCF header
4603        vcf_reader = self.get_header()
4604        log.debug("Initial header: " + str(vcf_reader.infos))
4605
4606        # Samples
4607        samples = self.get_header_sample_list()
4608        if not samples:
4609            log.error("No Samples in VCF")
4610            return False
4611        log.debug(f"Samples: {samples}")
4612
4613        # Memory limit
4614        memory_limit = self.get_memory("8G")
4615        log.debug(f"memory_limit: {memory_limit}")
4616
4617        # Exomiser java options
4618        exomiser_java_options = (
4619            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4620        )
4621        log.debug(f"Exomiser java options: {exomiser_java_options}")
4622
4623        # Download Exomiser (if not exists)
4624        exomiser_release = param_exomiser.get("release", None)
4625        exomiser_application_properties = param_exomiser.get(
4626            "exomiser_application_properties", None
4627        )
4628        databases_download_exomiser(
4629            assemblies=[assembly],
4630            exomiser_folder=databases_folders,
4631            exomiser_release=exomiser_release,
4632            exomiser_phenotype_release=exomiser_release,
4633            exomiser_application_properties=exomiser_application_properties,
4634        )
4635
4636        # Force annotation
4637        force_update_annotation = True
4638
4639        if "Exomiser" not in self.get_header().infos or force_update_annotation:
4640            log.debug("Start annotation Exomiser")
4641
4642            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
4643
4644                # tmp_dir = "/tmp/exomiser"
4645
4646                ### ANALYSIS ###
4647                ################
4648
4649                # Create analysis.json through analysis dict
4650                # either analysis in param or by default
4651                # depending on preset exome/genome)
4652
4653                # Init analysis dict
4654                param_exomiser_analysis_dict = {}
4655
4656                # analysis from param
4657                param_exomiser_analysis = param_exomiser.get("analysis", {})
4658                param_exomiser_analysis = full_path(param_exomiser_analysis)
4659
4660                # If analysis in param -> load anlaysis json
4661                if param_exomiser_analysis:
4662
4663                    # If param analysis is a file and exists
4664                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
4665                        param_exomiser_analysis
4666                    ):
4667                        # Load analysis file into analysis dict (either yaml or json)
4668                        with open(param_exomiser_analysis) as json_file:
4669                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
4670
4671                    # If param analysis is a dict
4672                    elif isinstance(param_exomiser_analysis, dict):
4673                        # Load analysis dict into analysis dict (either yaml or json)
4674                        param_exomiser_analysis_dict = param_exomiser_analysis
4675
4676                    # Error analysis type
4677                    else:
4678                        log.error(f"Analysis type unknown. Check param file.")
4679                        raise ValueError(f"Analysis type unknown. Check param file.")
4680
4681                # Case no input analysis config file/dict
4682                # Use preset (exome/genome) to open default config file
4683                if not param_exomiser_analysis_dict:
4684
4685                    # default preset
4686                    default_preset = "exome"
4687
4688                    # Get param preset or default preset
4689                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
4690
4691                    # Try to find if preset is a file
4692                    if os.path.exists(param_exomiser_preset):
4693                        # Preset file is provided in full path
4694                        param_exomiser_analysis_default_config_file = (
4695                            param_exomiser_preset
4696                        )
4697                    # elif os.path.exists(full_path(param_exomiser_preset)):
4698                    #     # Preset file is provided in full path
4699                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
4700                    elif os.path.exists(
4701                        os.path.join(folder_config, param_exomiser_preset)
4702                    ):
4703                        # Preset file is provided a basename in config folder (can be a path with subfolders)
4704                        param_exomiser_analysis_default_config_file = os.path.join(
4705                            folder_config, param_exomiser_preset
4706                        )
4707                    else:
4708                        # Construct preset file
4709                        param_exomiser_analysis_default_config_file = os.path.join(
4710                            folder_config,
4711                            f"preset-{param_exomiser_preset}-analysis.json",
4712                        )
4713
4714                    # If preset file exists
4715                    param_exomiser_analysis_default_config_file = full_path(
4716                        param_exomiser_analysis_default_config_file
4717                    )
4718                    if os.path.exists(param_exomiser_analysis_default_config_file):
4719                        # Load prest file into analysis dict (either yaml or json)
4720                        with open(
4721                            param_exomiser_analysis_default_config_file
4722                        ) as json_file:
4723                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
4724                                json_file
4725                            )
4726
4727                    # Error preset file
4728                    else:
4729                        log.error(
4730                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4731                        )
4732                        raise ValueError(
4733                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4734                        )
4735
4736                # If no analysis dict created
4737                if not param_exomiser_analysis_dict:
4738                    log.error(f"No analysis config")
4739                    raise ValueError(f"No analysis config")
4740
4741                # Log
4742                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4743
4744                ### PHENOPACKET ###
4745                ###################
4746
4747                # If no PhenoPacket in analysis dict -> check in param
4748                if "phenopacket" not in param_exomiser_analysis_dict:
4749
4750                    # If PhenoPacket in param -> load anlaysis json
4751                    if param_exomiser.get("phenopacket", None):
4752
4753                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
4754                        param_exomiser_phenopacket = full_path(
4755                            param_exomiser_phenopacket
4756                        )
4757
4758                        # If param phenopacket is a file and exists
4759                        if isinstance(
4760                            param_exomiser_phenopacket, str
4761                        ) and os.path.exists(param_exomiser_phenopacket):
4762                            # Load phenopacket file into analysis dict (either yaml or json)
4763                            with open(param_exomiser_phenopacket) as json_file:
4764                                param_exomiser_analysis_dict["phenopacket"] = (
4765                                    yaml.safe_load(json_file)
4766                                )
4767
4768                        # If param phenopacket is a dict
4769                        elif isinstance(param_exomiser_phenopacket, dict):
4770                            # Load phenopacket dict into analysis dict (either yaml or json)
4771                            param_exomiser_analysis_dict["phenopacket"] = (
4772                                param_exomiser_phenopacket
4773                            )
4774
4775                        # Error phenopacket type
4776                        else:
4777                            log.error(f"Phenopacket type unknown. Check param file.")
4778                            raise ValueError(
4779                                f"Phenopacket type unknown. Check param file."
4780                            )
4781
4782                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
4783                if "phenopacket" not in param_exomiser_analysis_dict:
4784
4785                    # Init PhenoPacket
4786                    param_exomiser_analysis_dict["phenopacket"] = {
4787                        "id": "analysis",
4788                        "proband": {},
4789                    }
4790
4791                    ### Add subject ###
4792
4793                    # If subject exists
4794                    param_exomiser_subject = param_exomiser.get("subject", {})
4795
4796                    # If subject not exists -> found sample ID
4797                    if not param_exomiser_subject:
4798
4799                        # Found sample ID in param
4800                        sample = param_exomiser.get("sample", None)
4801
4802                        # Find sample ID (first sample)
4803                        if not sample:
4804                            sample_list = self.get_header_sample_list()
4805                            if len(sample_list) > 0:
4806                                sample = sample_list[0]
4807                            else:
4808                                log.error(f"No sample found")
4809                                raise ValueError(f"No sample found")
4810
4811                        # Create subject
4812                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
4813
4814                    # Add to dict
4815                    param_exomiser_analysis_dict["phenopacket"][
4816                        "subject"
4817                    ] = param_exomiser_subject
4818
4819                    ### Add "phenotypicFeatures" ###
4820
4821                    # If phenotypicFeatures exists
4822                    param_exomiser_phenotypicfeatures = param_exomiser.get(
4823                        "phenotypicFeatures", []
4824                    )
4825
4826                    # If phenotypicFeatures not exists -> Try to infer from hpo list
4827                    if not param_exomiser_phenotypicfeatures:
4828
4829                        # Found HPO in param
4830                        param_exomiser_hpo = param_exomiser.get("hpo", [])
4831
4832                        # Split HPO if list in string format separated by comma
4833                        if isinstance(param_exomiser_hpo, str):
4834                            param_exomiser_hpo = param_exomiser_hpo.split(",")
4835
4836                        # Create HPO list
4837                        for hpo in param_exomiser_hpo:
4838                            hpo_clean = re.sub("[^0-9]", "", hpo)
4839                            param_exomiser_phenotypicfeatures.append(
4840                                {
4841                                    "type": {
4842                                        "id": f"HP:{hpo_clean}",
4843                                        "label": f"HP:{hpo_clean}",
4844                                    }
4845                                }
4846                            )
4847
4848                    # Add to dict
4849                    param_exomiser_analysis_dict["phenopacket"][
4850                        "phenotypicFeatures"
4851                    ] = param_exomiser_phenotypicfeatures
4852
4853                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
4854                    if not param_exomiser_phenotypicfeatures:
4855                        for step in param_exomiser_analysis_dict.get(
4856                            "analysis", {}
4857                        ).get("steps", []):
4858                            if "hiPhivePrioritiser" in step:
4859                                param_exomiser_analysis_dict.get("analysis", {}).get(
4860                                    "steps", []
4861                                ).remove(step)
4862
4863                ### Add Input File ###
4864
4865                # Initial file name and htsFiles
4866                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
4867                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
4868                    {
4869                        "uri": tmp_vcf_name,
4870                        "htsFormat": "VCF",
4871                        "genomeAssembly": assembly,
4872                    }
4873                ]
4874
4875                ### Add metaData ###
4876
4877                # If metaData not in analysis dict
4878                if "metaData" not in param_exomiser_analysis_dict:
4879                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
4880                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
4881                        "createdBy": "howard",
4882                        "phenopacketSchemaVersion": 1,
4883                    }
4884
4885                ### OutputOptions ###
4886
4887                # Init output result folder
4888                output_results = os.path.join(tmp_dir, "results")
4889
4890                # If no outputOptions in analysis dict
4891                if "outputOptions" not in param_exomiser_analysis_dict:
4892
4893                    # default output formats
4894                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
4895
4896                    # Get outputOptions in param
4897                    output_options = param_exomiser.get("outputOptions", None)
4898
4899                    # If no output_options in param -> check
4900                    if not output_options:
4901                        output_options = {
4902                            "outputContributingVariantsOnly": False,
4903                            "numGenes": 0,
4904                            "outputFormats": defaut_output_formats,
4905                        }
4906
4907                    # Replace outputDirectory in output options
4908                    output_options["outputDirectory"] = output_results
4909                    output_options["outputFileName"] = "howard"
4910
4911                    # Add outputOptions in analysis dict
4912                    param_exomiser_analysis_dict["outputOptions"] = output_options
4913
4914                else:
4915
4916                    # Replace output_results and output format (if exists in param)
4917                    param_exomiser_analysis_dict["outputOptions"][
4918                        "outputDirectory"
4919                    ] = output_results
4920                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
4921                        list(
4922                            set(
4923                                param_exomiser_analysis_dict.get(
4924                                    "outputOptions", {}
4925                                ).get("outputFormats", [])
4926                                + ["TSV_VARIANT", "VCF"]
4927                            )
4928                        )
4929                    )
4930
4931                # log
4932                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4933
4934                ### ANALYSIS FILE ###
4935                #####################
4936
4937                ### Full JSON analysis config file ###
4938
4939                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
4940                with open(exomiser_analysis, "w") as fp:
4941                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
4942
4943                ### SPLIT analysis and sample config files
4944
4945                # Splitted analysis dict
4946                param_exomiser_analysis_dict_for_split = (
4947                    param_exomiser_analysis_dict.copy()
4948                )
4949
4950                # Phenopacket JSON file
4951                exomiser_analysis_phenopacket = os.path.join(
4952                    tmp_dir, "analysis_phenopacket.json"
4953                )
4954                with open(exomiser_analysis_phenopacket, "w") as fp:
4955                    json.dump(
4956                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
4957                        fp,
4958                        indent=4,
4959                    )
4960
4961                # Analysis JSON file without Phenopacket parameters
4962                param_exomiser_analysis_dict_for_split.pop("phenopacket")
4963                exomiser_analysis_analysis = os.path.join(
4964                    tmp_dir, "analysis_analysis.json"
4965                )
4966                with open(exomiser_analysis_analysis, "w") as fp:
4967                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
4968
4969                ### INITAL VCF file ###
4970                #######################
4971
4972                ### Create list of samples to use and include inti initial VCF file ####
4973
4974                # Subject (main sample)
4975                # Get sample ID in analysis dict
4976                sample_subject = (
4977                    param_exomiser_analysis_dict.get("phenopacket", {})
4978                    .get("subject", {})
4979                    .get("id", None)
4980                )
4981                sample_proband = (
4982                    param_exomiser_analysis_dict.get("phenopacket", {})
4983                    .get("proband", {})
4984                    .get("subject", {})
4985                    .get("id", None)
4986                )
4987                sample = []
4988                if sample_subject:
4989                    sample.append(sample_subject)
4990                if sample_proband:
4991                    sample.append(sample_proband)
4992
4993                # Get sample ID within Pedigree
4994                pedigree_persons_list = (
4995                    param_exomiser_analysis_dict.get("phenopacket", {})
4996                    .get("pedigree", {})
4997                    .get("persons", {})
4998                )
4999
5000                # Create list with all sample ID in pedigree (if exists)
5001                pedigree_persons = []
5002                for person in pedigree_persons_list:
5003                    pedigree_persons.append(person.get("individualId"))
5004
5005                # Concat subject sample ID and samples ID in pedigreesamples
5006                samples = list(set(sample + pedigree_persons))
5007
5008                # Check if sample list is not empty
5009                if not samples:
5010                    log.error(f"No samples found")
5011                    raise ValueError(f"No samples found")
5012
5013                # Create VCF with sample (either sample in param or first one by default)
5014                # Export VCF file
5015                self.export_variant_vcf(
5016                    vcf_file=tmp_vcf_name,
5017                    remove_info=True,
5018                    add_samples=True,
5019                    list_samples=samples,
5020                    index=False,
5021                )
5022
5023                ### Execute Exomiser ###
5024                ########################
5025
5026                # Init command
5027                exomiser_command = ""
5028
5029                # Command exomiser options
5030                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
5031
5032                # Release
5033                exomiser_release = param_exomiser.get("release", None)
5034                if exomiser_release:
5035                    # phenotype data version
5036                    exomiser_options += (
5037                        f" --exomiser.phenotype.data-version={exomiser_release} "
5038                    )
5039                    # data version
5040                    exomiser_options += (
5041                        f" --exomiser.{assembly}.data-version={exomiser_release} "
5042                    )
5043                    # variant white list
5044                    variant_white_list_file = (
5045                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
5046                    )
5047                    if os.path.exists(
5048                        os.path.join(
5049                            databases_folders, assembly, variant_white_list_file
5050                        )
5051                    ):
5052                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
5053
5054                # transcript_source
5055                transcript_source = param_exomiser.get(
5056                    "transcript_source", None
5057                )  # ucsc, refseq, ensembl
5058                if transcript_source:
5059                    exomiser_options += (
5060                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
5061                    )
5062
5063                # If analysis contain proband param
5064                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
5065                    "proband", {}
5066                ):
5067                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
5068
5069                # If no proband (usually uniq sample)
5070                else:
5071                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
5072
5073                # Log
5074                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
5075
5076                # Run command
5077                result = subprocess.call(
5078                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
5079                )
5080                if result:
5081                    log.error("Exomiser command failed")
5082                    raise ValueError("Exomiser command failed")
5083
5084                ### RESULTS ###
5085                ###############
5086
5087                ### Annotate with TSV fields ###
5088
5089                # Init result tsv file
5090                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
5091
5092                # Init result tsv file
5093                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
5094
5095                # Parse TSV file and explode columns in INFO field
5096                if exomiser_to_info and os.path.exists(output_results_tsv):
5097
5098                    # Log
5099                    log.debug("Exomiser columns to VCF INFO field")
5100
5101                    # Retrieve columns and types
5102                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
5103                    output_results_tsv_df = self.get_query_to_df(query)
5104                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
5105
5106                    # Init concat fields for update
5107                    sql_query_update_concat_fields = []
5108
5109                    # Fields to avoid
5110                    fields_to_avoid = [
5111                        "CONTIG",
5112                        "START",
5113                        "END",
5114                        "REF",
5115                        "ALT",
5116                        "QUAL",
5117                        "FILTER",
5118                        "GENOTYPE",
5119                    ]
5120
5121                    # List all columns to add into header
5122                    for header_column in output_results_tsv_columns:
5123
5124                        # If header column is enable
5125                        if header_column not in fields_to_avoid:
5126
5127                            # Header info type
5128                            header_info_type = "String"
5129                            header_column_df = output_results_tsv_df[header_column]
5130                            header_column_df_dtype = header_column_df.dtype
5131                            if header_column_df_dtype == object:
5132                                if (
5133                                    pd.to_numeric(header_column_df, errors="coerce")
5134                                    .notnull()
5135                                    .all()
5136                                ):
5137                                    header_info_type = "Float"
5138                            else:
5139                                header_info_type = "Integer"
5140
5141                            # Header info
5142                            characters_to_validate = ["-"]
5143                            pattern = "[" + "".join(characters_to_validate) + "]"
5144                            header_info_name = re.sub(
5145                                pattern,
5146                                "_",
5147                                f"Exomiser_{header_column}".replace("#", ""),
5148                            )
5149                            header_info_number = "."
5150                            header_info_description = (
5151                                f"Exomiser {header_column} annotation"
5152                            )
5153                            header_info_source = "Exomiser"
5154                            header_info_version = "unknown"
5155                            header_info_code = CODE_TYPE_MAP[header_info_type]
5156                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
5157                                header_info_name,
5158                                header_info_number,
5159                                header_info_type,
5160                                header_info_description,
5161                                header_info_source,
5162                                header_info_version,
5163                                header_info_code,
5164                            )
5165
5166                            # Add field to add for update to concat fields
5167                            sql_query_update_concat_fields.append(
5168                                f"""
5169                                CASE
5170                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
5171                                    THEN concat(
5172                                        '{header_info_name}=',
5173                                        table_parquet."{header_column}",
5174                                        ';'
5175                                        )
5176
5177                                    ELSE ''
5178                                END
5179                            """
5180                            )
5181
5182                    # Update query
5183                    sql_query_update = f"""
5184                        UPDATE {table_variants} as table_variants
5185                            SET INFO = concat(
5186                                            CASE
5187                                                WHEN INFO NOT IN ('', '.')
5188                                                THEN INFO
5189                                                ELSE ''
5190                                            END,
5191                                            CASE
5192                                                WHEN table_variants.INFO NOT IN ('','.')
5193                                                THEN ';'
5194                                                ELSE ''
5195                                            END,
5196                                            (
5197                                            SELECT 
5198                                                concat(
5199                                                    {",".join(sql_query_update_concat_fields)}
5200                                                )
5201                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
5202                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
5203                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
5204                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
5205                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
5206                                            )
5207                                        )
5208                            ;
5209                        """
5210
5211                    # Update
5212                    self.conn.execute(sql_query_update)
5213
5214                ### Annotate with VCF INFO field ###
5215
5216                # Init result VCF file
5217                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
5218
5219                # If VCF exists
5220                if os.path.exists(output_results_vcf):
5221
5222                    # Log
5223                    log.debug("Exomiser result VCF update variants")
5224
5225                    # Find Exomiser INFO field annotation in header
5226                    with gzip.open(output_results_vcf, "rt") as f:
5227                        header_list = self.read_vcf_header(f)
5228                    exomiser_vcf_header = vcf.Reader(
5229                        io.StringIO("\n".join(header_list))
5230                    )
5231
5232                    # Add annotation INFO field to header
5233                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
5234
5235                    # Update variants with VCF
5236                    self.update_from_vcf(output_results_vcf)
5237
5238        return True

This function annotates with Exomiser

This function uses args as parameters, in section "annotation" -> "exomiser", with sections:

  • "analysis" (dict/file): Full analysis dictionary parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO) Default : None
  • "preset" (string): Analysis preset (available in config folder). Used if no full "analysis" is provided. Default: "exome"
  • "phenopacket" (dict/file): Samples and phenotypic features parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. Default: None
  • "subject" (dict): Sample parameters (see Exomiser docs). Example: "subject": { "id": "ISDBM322017", "sex": "FEMALE" } Default: None
  • "sample" (string): Sample name to construct "subject" section: "subject": { "id": "", "sex": "UNKNOWN_SEX" } Default: None
  • "phenotypicFeatures" (dict) Phenotypic features to construct "subject" section. Example: "phenotypicFeatures": [ { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, { "type": { "id": "HP:0000486", "label": "Strabismus" } } ]
  • "hpo" (list) List of HPO ids as phenotypic features. Example: "hpo": ['0001156', '0001363', '0011304', '0010055'] Default: []
  • "outputOptions" (dict): Output options (see Exomiser docs). Default: "output_options" = { "outputContributingVariantsOnly": False, "numGenes": 0, "outputFormats": ["TSV_VARIANT", "VCF"] }
  • "transcript_source" (string): Transcript source (either "refseq", "ucsc", "ensembl") Default: "refseq"
  • "exomiser_to_info" (boolean): Add exomiser TSV file columns as INFO fields in VCF. Default: False
  • "release" (string): Exomiser database release. If it does not exist, the database release will be downloaded (takes a while). Default: None (provided by application.properties configuration file)
  • "exomiser_application_properties" (file): Exomiser configuration file (see Exomiser docs). Useful to automatically download databases (especially for specific genome databases).

Notes:

  • If no sample in parameters, first sample in VCF will be chosen
  • If no HPO found, "hiPhivePrioritiser" analysis step will be switched off
Parameters
  • threads: The number of threads to use
Returns

None.

def annotation_snpeff(self, threads: int = None) -> None:
5240    def annotation_snpeff(self, threads: int = None) -> None:
5241        """
5242        This function annotate with snpEff
5243
5244        :param threads: The number of threads to use
5245        :return: the value of the variable "return_value".
5246        """
5247
5248        # DEBUG
5249        log.debug("Start annotation with snpeff databases")
5250
5251        # Threads
5252        if not threads:
5253            threads = self.get_threads()
5254        log.debug("Threads: " + str(threads))
5255
5256        # DEBUG
5257        delete_tmp = True
5258        if self.get_config().get("verbosity", "warning") in ["debug"]:
5259            delete_tmp = False
5260            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5261
5262        # Config
5263        config = self.get_config()
5264        log.debug("Config: " + str(config))
5265
5266        # Config - Folders - Databases
5267        databases_folders = (
5268            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
5269        )
5270        log.debug("Databases annotations: " + str(databases_folders))
5271
5272        # Config - snpEff bin command
5273        snpeff_bin_command = get_bin_command(
5274            bin="snpEff.jar",
5275            tool="snpeff",
5276            bin_type="jar",
5277            config=config,
5278            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
5279        )
5280        if not snpeff_bin_command:
5281            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
5282            log.error(msg_err)
5283            raise ValueError(msg_err)
5284
5285        # Config - snpEff databases
5286        snpeff_databases = (
5287            config.get("folders", {})
5288            .get("databases", {})
5289            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
5290        )
5291        snpeff_databases = full_path(snpeff_databases)
5292        if snpeff_databases is not None and snpeff_databases != "":
5293            log.debug(f"Create snpEff databases folder")
5294            if not os.path.exists(snpeff_databases):
5295                os.makedirs(snpeff_databases)
5296
5297        # Param
5298        param = self.get_param()
5299        log.debug("Param: " + str(param))
5300
5301        # Param
5302        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
5303        log.debug("Options: " + str(options))
5304
5305        # Param - Assembly
5306        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
5307
5308        # Param - Options
5309        snpeff_options = (
5310            param.get("annotation", {}).get("snpeff", {}).get("options", "")
5311        )
5312        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
5313        snpeff_csvstats = (
5314            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
5315        )
5316        if snpeff_stats:
5317            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
5318            snpeff_stats = full_path(snpeff_stats)
5319            snpeff_options += f" -stats {snpeff_stats}"
5320        if snpeff_csvstats:
5321            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
5322            snpeff_csvstats = full_path(snpeff_csvstats)
5323            snpeff_options += f" -csvStats {snpeff_csvstats}"
5324
5325        # Data
5326        table_variants = self.get_table_variants()
5327
5328        # Check if not empty
5329        log.debug("Check if not empty")
5330        sql_query_chromosomes = (
5331            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
5332        )
5333        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
5334        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
5335            log.info(f"VCF empty")
5336            return
5337
5338        # Export in VCF
5339        log.debug("Create initial file to annotate")
5340        tmp_vcf = NamedTemporaryFile(
5341            prefix=self.get_prefix(),
5342            dir=self.get_tmp_dir(),
5343            suffix=".vcf.gz",
5344            delete=True,
5345        )
5346        tmp_vcf_name = tmp_vcf.name
5347
5348        # VCF header
5349        vcf_reader = self.get_header()
5350        log.debug("Initial header: " + str(vcf_reader.infos))
5351
5352        # Existing annotations
5353        for vcf_annotation in self.get_header().infos:
5354
5355            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5356            log.debug(
5357                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5358            )
5359
5360        # Memory limit
5361        # if config.get("memory", None):
5362        #     memory_limit = config.get("memory", "8G")
5363        # else:
5364        #     memory_limit = "8G"
5365        memory_limit = self.get_memory("8G")
5366        log.debug(f"memory_limit: {memory_limit}")
5367
5368        # snpEff java options
5369        snpeff_java_options = (
5370            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
5371        )
5372        log.debug(f"Exomiser java options: {snpeff_java_options}")
5373
5374        force_update_annotation = True
5375
5376        if "ANN" not in self.get_header().infos or force_update_annotation:
5377
5378            # Check snpEff database
5379            log.debug(f"Check snpEff databases {[assembly]}")
5380            databases_download_snpeff(
5381                folder=snpeff_databases, assemblies=[assembly], config=config
5382            )
5383
5384            # Export VCF file
5385            self.export_variant_vcf(
5386                vcf_file=tmp_vcf_name,
5387                remove_info=True,
5388                add_samples=False,
5389                index=True,
5390            )
5391
5392            # Tmp file
5393            err_files = []
5394            tmp_annotate_vcf = NamedTemporaryFile(
5395                prefix=self.get_prefix(),
5396                dir=self.get_tmp_dir(),
5397                suffix=".vcf",
5398                delete=False,
5399            )
5400            tmp_annotate_vcf_name = tmp_annotate_vcf.name
5401            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
5402            err_files.append(tmp_annotate_vcf_name_err)
5403
5404            # Command
5405            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
5406            log.debug(f"Annotation - snpEff command: {snpeff_command}")
5407            run_parallel_commands([snpeff_command], 1)
5408
5409            # Error messages
5410            log.info(f"Error/Warning messages:")
5411            error_message_command_all = []
5412            error_message_command_warning = []
5413            error_message_command_err = []
5414            for err_file in err_files:
5415                with open(err_file, "r") as f:
5416                    for line in f:
5417                        message = line.strip()
5418                        error_message_command_all.append(message)
5419                        if line.startswith("[W::"):
5420                            error_message_command_warning.append(message)
5421                        if line.startswith("[E::"):
5422                            error_message_command_err.append(f"{err_file}: " + message)
5423            # log info
5424            for message in list(
5425                set(error_message_command_err + error_message_command_warning)
5426            ):
5427                log.info(f"   {message}")
5428            # debug info
5429            for message in list(set(error_message_command_all)):
5430                log.debug(f"   {message}")
5431            # failed
5432            if len(error_message_command_err):
5433                log.error("Annotation failed: Error in commands")
5434                raise ValueError("Annotation failed: Error in commands")
5435
5436            # Find annotation in header
5437            with open(tmp_annotate_vcf_name, "rt") as f:
5438                header_list = self.read_vcf_header(f)
5439            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
5440
5441            for ann in annovar_vcf_header.infos:
5442                if ann not in self.get_header().infos:
5443                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
5444
5445            # Update variants
5446            log.info(f"Annotation - Updating...")
5447            self.update_from_vcf(tmp_annotate_vcf_name)
5448
5449        else:
5450            if "ANN" in self.get_header().infos:
5451                log.debug(f"Existing snpEff annotations in VCF")
5452            if force_update_annotation:
5453                log.debug(f"Existing snpEff annotations in VCF - annotation forced")

This function annotates with snpEff

Parameters
  • threads: The number of threads to use
Returns

the value of the variable "return_value".

def annotation_annovar(self, threads: int = None) -> None:
    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate variants with Annovar.

        Exports variants to a temporary VCF, runs `table_annovar.pl` once per
        configured database, post-processes each annotated VCF with a
        bcftools/sed/awk pipeline (Annovar artifact cleanup, field selection
        and renaming), merges the per-database results with bcftools merge,
        updates the variants table, and cleans temporary files.

        :param threads: number of threads to use; defaults to the configured
            thread count
        :return: None. Returns early if the variants table is empty; does
            nothing if no annotations are configured.
        :raises ValueError: if the Annovar or bcftools binary is missing, the
            databases folder cannot be configured, or a command reports errors
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp and Err files, collected for final cleanup
        tmp_files = []
        err_files = []

        # Keep tmp files/folders when verbosity is debug, to ease troubleshooting
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command (perl table_annovar.pl invocation)
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases folder (created if missing)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        if annovar_databases is not None:
            # When a list of folders is configured, only the first one is used
            if isinstance(annovar_databases, list):
                annovar_databases = full_path(annovar_databases[0])
                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
            # NOTE(review): full_path is applied a second time for the list
            # case above; assumed idempotent — confirm
            annovar_databases = full_path(annovar_databases)
            if not os.path.exists(annovar_databases):
                log.info(f"Annovar databases folder '{annovar_databases}' created")
                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
        else:
            msg_err = f"Annovar databases configuration failed"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options (extra table_annovar.pl command-line options)
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations: mapping of database name -> {field: new_name}
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly subfolder (created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Annotation is currently always (re-)applied
        force_update_annotation = True

        if annotations:

            # NOTE(review): 'commands' appears unused in this method — confirm
            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (bcftools --rename-annots format:
            # one "INFO/old new" mapping per line)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database (downloads missing databases)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Empty field mapping means "take the whole INFO"
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar (err_files is reset per database so
                # error checks below only see this iteration's stderr)
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Fields to annotate and their renamed counterparts
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info (appended to the shared rename file)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol
                protocol = annotation

                # argument
                argument = ""

                # operation: "g" gene-based, "r" region-based, "f" filter-based
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages: scan stderr files for warnings/errors
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed: any error line aborts the annotation
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file for the merged result
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge: initial VCF + one annotated VCF per database
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header of the merged VCF
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                # Merge new INFO definitions into the in-memory header
                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

            # Clean files
            # Tmp file remove command
            # NOTE(review): 'if True' ignores the delete_tmp flag computed
            # above, so cleanup always runs even in debug mode — confirm
            # whether 'if delete_tmp:' was intended
            if True:
                tmp_files_remove_command = ""
                if tmp_files:
                    tmp_files_remove_command = " ".join(tmp_files)
                clean_command = f" rm -f {tmp_files_remove_command} "
                log.debug(f"Annotation Annovar - Annotation cleaning ")
                log.debug(f"Annotation - cleaning command: {clean_command}")
                run_parallel_commands([clean_command], 1)

Takes a VCF file, annotates it with Annovar, and then updates the database with the new annotations.

Parameters
  • threads: number of threads to use

Returns

the value of the variable "return_value".

def annotation_parquet(self, threads: int = None) -> None:
    def annotation_parquet(self, threads: int = None) -> None:
        """
        Annotate variants with parquet-format annotation databases.

        For each database configured in param ``annotation.parquet.annotations``,
        the selected INFO fields are appended to the ``INFO`` column of the
        variants table through per-chromosome SQL UPDATE queries, and the new
        fields are registered in the in-memory VCF header (``self.get_header()``).

        Behaviour switches (param ``annotation.options``):

        - ``annotations_update``: existing INFO fields are first removed from
          variants, then re-annotated (forced update)
        - ``annotations_append``: annotate only variants where the field is
          currently empty or '.' (append without overwriting)

        The special annotation key ``"ALL"`` scans available databases
        (optionally filtered by ``formats``/``releases``) and annotates with
        each of them using all their INFO fields.

        :param threads: number of threads to use for the annotation
            (defaults to ``self.get_threads()``)
        :return: None
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        # NOTE(review): delete_tmp is only computed for this debug trace and is
        # not used elsewhere in this method
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        # Folders searched for annotation databases: union of the configured
        # 'annotations' and 'parquet' database folders (default: current dir)
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        # Mapping of database identifier -> {field: new_name_or_None}
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update", False)
        )
        log.debug(f"force_update_annotation={force_update_annotation}")
        force_append_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_append", False)
        )
        log.debug(f"force_append_annotation={force_append_annotation}")

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes_df = self.get_query_to_df(
            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
        )
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        # vcf_reader is the live header object: INFO entries added below are
        # visible to later header accesses
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Nb Variants POS
        log.debug("NB Variants Start")
        nb_variants = self.conn.execute(
            f"SELECT count(*) AS count FROM variants"
        ).fetchdf()["count"][0]
        log.debug("NB Variants Stop")

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Added columns
        # NOTE(review): added_columns is never populated in this method, so the
        # cleanup loop at the end is currently a no-op
        added_columns = []

        # drop indexes
        # Indexes are dropped before the bulk UPDATE queries below
        log.debug(f"Drop indexes...")
        self.drop_indexes()

        if annotations:

            # Expand the special "ALL" key: every scanned database not already
            # configured is added with all of its INFO fields
            if "ALL" in annotations:

                all_param = annotations.get("ALL", {})
                all_param_formats = all_param.get("formats", None)
                all_param_releases = all_param.get("releases", None)

                databases_infos_dict = self.scan_databases(
                    database_formats=all_param_formats,
                    database_releases=all_param_releases,
                )
                for database_infos in databases_infos_dict.keys():
                    if database_infos not in annotations:
                        annotations[database_infos] = {"INFO": None}

            for annotation in annotations:

                if annotation in ["ALL"]:
                    continue

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # Annotation fields
                # Empty/None field mapping means "annotate with full INFO"
                annotation_fields = annotations[annotation]
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                parquet_file = database.get_database()
                parquet_hdr_file = database.get_header_file()
                parquet_type = database.get_type()

                # Check if files exists
                if not parquet_file or not parquet_hdr_file:
                    msg_err_list = []
                    if not parquet_file:
                        msg_err_list.append(
                            f"Annotation failed: Annotation file not found"
                        )
                    if parquet_file and not parquet_hdr_file:
                        msg_err_list.append(
                            f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'"
                        )

                    log.error(". ".join(msg_err_list))
                    raise ValueError(". ".join(msg_err_list))
                else:
                    # Get parquet connexion
                    # Attach the database to the current DuckDB connection if
                    # the database type requires it
                    parquet_sql_attach = database.get_sql_database_attach(
                        output="query"
                    )
                    if parquet_sql_attach:
                        self.conn.execute(parquet_sql_attach)
                    parquet_file_link = database.get_sql_database_link()
                    # Log
                    log.debug(
                        f"Annotation '{annotation_name}' - file: "
                        + str(parquet_file)
                        + " and "
                        + str(parquet_hdr_file)
                    )

                    # Database full header columns
                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
                        parquet_hdr_file
                    )
                    # Log
                    log.debug(
                        "Annotation database header columns : "
                        + str(parquet_hdr_vcf_header_columns)
                    )

                    # Load header as VCF object
                    parquet_hdr_vcf_header_infos = database.get_header().infos
                    # Log
                    log.debug(
                        "Annotation database header: "
                        + str(parquet_hdr_vcf_header_infos)
                    )

                    # Get extra infos
                    parquet_columns = database.get_extra_columns()
                    # Log
                    log.debug("Annotation database Columns: " + str(parquet_columns))

                    # Add extra columns if "ALL" in annotation_fields
                    # if "ALL" in annotation_fields:
                    #     allow_add_extra_column = True
                    # Register extra (non-INFO) database columns as String INFO
                    # fields with a generic description so they can be selected
                    if "ALL" in annotation_fields and database.get_extra_columns():
                        for extra_column in database.get_extra_columns():
                            if (
                                extra_column not in annotation_fields
                                and extra_column.replace("INFO/", "")
                                not in parquet_hdr_vcf_header_infos
                            ):
                                parquet_hdr_vcf_header_infos[extra_column] = (
                                    vcf.parser._Info(
                                        extra_column,
                                        ".",
                                        "String",
                                        f"{extra_column} description",
                                        "unknown",
                                        "unknown",
                                        self.code_type_map["String"],
                                    )
                                )

                    # For all fields in database
                    # "ALL" or "INFO" selects every field declared in the
                    # database header (identity mapping: no renaming)
                    annotation_fields_all = False
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields_all = True
                        annotation_fields = {
                            key: key for key in parquet_hdr_vcf_header_infos
                        }

                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Init

                    # List of annotation fields to use
                    sql_query_annotation_update_info_sets = []

                    # List of annotation to agregate
                    sql_query_annotation_to_agregate = []

                    # Number of fields
                    nb_annotation_field = 0

                    # Annotation fields processed
                    annotation_fields_processed = []

                    # Columns mapping
                    # Maps requested field names to actual database columns
                    # (fields found only in INFO map to the "INFO" column)
                    map_columns = database.map_columns(
                        columns=annotation_fields, prefixes=["INFO/"]
                    )

                    # Query dict for fields to remove (update option)
                    query_dict_remove = {}

                    # Fetch Anotation fields
                    for annotation_field in annotation_fields:

                        # annotation_field_column
                        annotation_field_column = map_columns.get(
                            annotation_field, "INFO"
                        )

                        # field new name, if parametered
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # To annotate
                        # force_update_annotation = True
                        # force_append_annotation = True
                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
                        # A field is annotated when it exists in the database
                        # header AND (update/append is forced OR it is not yet
                        # present in the variants header)
                        if annotation_field in parquet_hdr_vcf_header_infos and (
                            force_update_annotation
                            or force_append_annotation
                            or (
                                annotation_fields_new_name
                                not in self.get_header().infos
                            )
                        ):

                            # Add field to annotation to process list
                            annotation_fields_processed.append(
                                annotation_fields_new_name
                            )

                            # explode infos for the field
                            annotation_fields_new_name_info_msg = ""
                            if (
                                force_update_annotation
                                and annotation_fields_new_name
                                in self.get_header().infos
                            ):
                                # Remove field from INFO
                                # Strips 'FIELD=value' (with optional leading
                                # ';') from INFO before re-annotation
                                query = f"""
                                    UPDATE {table_variants} as table_variants
                                    SET INFO = REGEXP_REPLACE(
                                                concat(table_variants.INFO,''),
                                                ';*{annotation_fields_new_name}=[^;]*',
                                                ''
                                                )
                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
                                """
                                annotation_fields_new_name_info_msg = " [update]"
                                query_dict_remove[
                                    f"remove 'INFO/{annotation_fields_new_name}'"
                                ] = query

                            # Sep between fields in INFO
                            nb_annotation_field += 1
                            if nb_annotation_field > 1:
                                annotation_field_sep = ";"
                            else:
                                annotation_field_sep = ""

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
                            )

                            # Add INFO field to header
                            # Copy number/type/description/source/version from
                            # the database header, with safe fallbacks
                            parquet_hdr_vcf_header_infos_number = (
                                parquet_hdr_vcf_header_infos[annotation_field].num
                                or "."
                            )
                            parquet_hdr_vcf_header_infos_type = (
                                parquet_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            parquet_hdr_vcf_header_infos_description = (
                                parquet_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            parquet_hdr_vcf_header_infos_source = (
                                parquet_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            parquet_hdr_vcf_header_infos_version = (
                                parquet_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    parquet_hdr_vcf_header_infos_number,
                                    parquet_hdr_vcf_header_infos_type,
                                    parquet_hdr_vcf_header_infos_description,
                                    parquet_hdr_vcf_header_infos_source,
                                    parquet_hdr_vcf_header_infos_version,
                                    self.code_type_map[
                                        parquet_hdr_vcf_header_infos_type
                                    ],
                                )
                            )

                            # Append
                            # Append mode: only annotate when the field is
                            # currently missing, empty or '.' in variants INFO
                            if force_append_annotation:
                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
                            else:
                                query_case_when_append = ""

                            # Annotation/Update query fields
                            # Found in INFO column
                            if (
                                annotation_field_column == "INFO"
                                and "INFO" in parquet_hdr_vcf_header_columns
                            ):
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
                                        ELSE ''
                                    END
                                """
                                )
                            # Found in a specific column
                            else:
                                # ';' in values is replaced by ',' to keep the
                                # INFO field format valid
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
                                        ELSE ''
                                    END
                                """
                                )
                                # Aggregation used by the 'regions' join below
                                sql_query_annotation_to_agregate.append(
                                    f""" string_agg(table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
                                )

                        # Not to annotate
                        else:

                            if force_update_annotation:
                                annotation_message = "forced"
                            else:
                                annotation_message = "skipped"

                            if annotation_field not in parquet_hdr_vcf_header_infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
                                )

                    # Check if ALL fields have to be annotated. Thus concat all INFO field
                    # allow_annotation_full_info = True
                    # Optimization: when every field of the database is wanted
                    # and the database exposes a full INFO column, copy INFO
                    # wholesale instead of extracting field by field
                    allow_annotation_full_info = not force_append_annotation

                    if parquet_type in ["regions"]:
                        allow_annotation_full_info = False

                    if (
                        allow_annotation_full_info
                        and nb_annotation_field == len(annotation_fields)
                        and annotation_fields_all
                        and (
                            "INFO" in parquet_hdr_vcf_header_columns
                            and "INFO" in database.get_extra_columns()
                        )
                    ):
                        log.debug("Column INFO annotation enabled")
                        sql_query_annotation_update_info_sets = []
                        sql_query_annotation_update_info_sets.append(
                            f" table_parquet.INFO "
                        )

                    if sql_query_annotation_update_info_sets:

                        # Annotate
                        log.info(f"Annotation '{annotation_name}' - Annotation...")

                        # Join query annotation update info sets for SQL
                        sql_query_annotation_update_info_sets_sql = ",".join(
                            sql_query_annotation_update_info_sets
                        )

                        # Check chromosomes list (and variants infos)
                        sql_query_chromosomes = f"""
                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
                            FROM {table_variants} as table_variants
                            GROUP BY table_variants."#CHROM"
                            ORDER BY table_variants."#CHROM"
                            """
                        sql_query_chromosomes_df = self.conn.execute(
                            sql_query_chromosomes
                        ).df()
                        sql_query_chromosomes_dict = {
                            entry["CHROM"]: {
                                "count": entry["count_variants"],
                                "min": entry["min_variants"],
                                "max": entry["max_variants"],
                            }
                            for index, entry in sql_query_chromosomes_df.iterrows()
                        }

                        # Init
                        nb_of_query = 0
                        nb_of_variant_annotated = 0
                        # NOTE(review): query_dict aliases query_dict_remove (no
                        # copy) — the removal queries run first, before the
                        # per-chromosome annotation queries added below
                        query_dict = query_dict_remove

                        # for chrom in sql_query_chromosomes_df["CHROM"]:
                        for chrom in sql_query_chromosomes_dict:

                            # Number of variant by chromosome
                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
                                chrom, {}
                            ).get("count", 0)

                            log.debug(
                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
                            )

                            # Annotation with regions database
                            # Regions match on interval overlap (POS within
                            # [START+1, END]) and aggregate overlapping values
                            if parquet_type in ["regions"]:
                                sql_query_annotation_from_clause = f"""
                                    FROM (
                                        SELECT 
                                            '{chrom}' AS \"#CHROM\",
                                            table_variants_from.\"POS\" AS \"POS\",
                                            {",".join(sql_query_annotation_to_agregate)}
                                        FROM {table_variants} as table_variants_from
                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
                                            table_parquet_from."#CHROM" = '{chrom}'
                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
                                            AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
                                        )
                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
                                        GROUP BY table_variants_from.\"POS\"
                                        )
                                        as table_parquet
                                """

                                sql_query_annotation_where_clause = """
                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                """

                            # Annotation with variants database
                            # Variants match on exact CHROM/POS/REF/ALT
                            else:
                                sql_query_annotation_from_clause = f"""
                                    FROM {parquet_file_link} as table_parquet
                                """
                                sql_query_annotation_where_clause = f"""
                                    table_variants."#CHROM" = '{chrom}'
                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                                """

                            # Create update query
                            # Concatenates the new annotations onto existing
                            # INFO, inserting a ';' separator only when both
                            # sides are non-empty
                            sql_query_annotation_chrom_interval_pos = f"""
                                UPDATE {table_variants} as table_variants
                                    SET INFO = 
                                        concat(
                                            CASE WHEN table_variants.INFO NOT IN ('','.')
                                                THEN table_variants.INFO
                                                ELSE ''
                                            END
                                            ,
                                            CASE WHEN table_variants.INFO NOT IN ('','.')
                                                        AND (
                                                        concat({sql_query_annotation_update_info_sets_sql})
                                                        )
                                                        NOT IN ('','.') 
                                                    THEN ';'
                                                    ELSE ''
                                            END
                                            ,
                                            {sql_query_annotation_update_info_sets_sql}
                                            )
                                    {sql_query_annotation_from_clause}
                                    WHERE {sql_query_annotation_where_clause}
                                    ;
                                """

                            # Add update query to dict
                            query_dict[
                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
                            ] = sql_query_annotation_chrom_interval_pos

                        nb_of_query = len(query_dict)
                        num_query = 0

                        # SET max_expression_depth TO x
                        # The generated concat() expressions can nest deeply
                        # when many fields are annotated; raise DuckDB's limit
                        self.conn.execute("SET max_expression_depth TO 10000")

                        for query_name in query_dict:
                            query = query_dict[query_name]
                            num_query += 1
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
                            )
                            result = self.conn.execute(query)
                            # DuckDB UPDATE statements return a 'Count' column
                            # holding the number of updated rows
                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
                            )

                        log.info(
                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
                        )

                    else:

                        log.info(
                            f"Annotation '{annotation_name}' - No Annotations available"
                        )

                    log.debug("Final header: " + str(vcf_reader.infos))

        # Remove added columns
        # NOTE(review): added_columns is never filled above, so this loop
        # currently does nothing — presumably kept for future use; confirm
        for added_column in added_columns:
            self.drop_column(column=added_column)

Takes a VCF file and annotates it with a parquet file.

Parameters
  • threads: number of threads to use for the annotation

Returns

None.

def annotation_splice(self, threads: int = None) -> None:
6430    def annotation_splice(self, threads: int = None) -> None:
6431        """
6432        This function annotate with snpEff
6433
6434        :param threads: The number of threads to use
6435        :return: the value of the variable "return_value".
6436        """
6437
6438        # DEBUG
6439        log.debug("Start annotation with splice tools")
6440
6441        # Threads
6442        if not threads:
6443            threads = self.get_threads()
6444        log.debug("Threads: " + str(threads))
6445
6446        # DEBUG
6447        delete_tmp = True
6448        if self.get_config().get("verbosity", "warning") in ["debug"]:
6449            delete_tmp = False
6450            log.debug("Delete tmp files/folders: " + str(delete_tmp))
6451
6452        # Config
6453        config = self.get_config()
6454        log.debug("Config: " + str(config))
6455        splice_config = config.get("tools", {}).get("splice", {})
6456        if not splice_config:
6457            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
6458            msg_err = "No Splice tool config"
6459            raise ValueError(msg_err)
6460        log.debug(f"splice_config: {splice_config}")
6461
6462        # Config - Folders - Databases
6463        databases_folders = (
6464            config.get("folders", {}).get("databases", {}).get("splice", ["."])
6465        )
6466        log.debug("Databases annotations: " + str(databases_folders))
6467
6468        # Splice docker image
6469        splice_docker_image = splice_config.get("docker").get("image")
6470
6471        # Pull splice image if it's not already there
6472        if not check_docker_image_exists(splice_docker_image):
6473            log.warning(
6474                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
6475            )
6476            try:
6477                command(f"docker pull {splice_config.get('docker').get('image')}")
6478            except subprocess.CalledProcessError:
6479                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
6480                log.error(msg_err)
6481                raise ValueError(msg_err)
6482
6483        # Config - splice databases
6484        splice_databases = (
6485            config.get("folders", {})
6486            .get("databases", {})
6487            .get("splice", DEFAULT_SPLICE_FOLDER)
6488        )
6489        splice_databases = full_path(splice_databases)
6490
6491        # Param
6492        param = self.get_param()
6493        log.debug("Param: " + str(param))
6494
6495        # Param
6496        options = param.get("annotation", {}).get("splice", {}).get("options", {})
6497        log.debug("Options: " + str(options))
6498
6499        # Data
6500        table_variants = self.get_table_variants()
6501
6502        # Check if not empty
6503        log.debug("Check if not empty")
6504        sql_query_chromosomes = (
6505            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
6506        )
6507        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
6508            log.info("VCF empty")
6509            return None
6510
6511        # Export in VCF
6512        log.debug("Create initial file to annotate")
6513
6514        # Create output folder / work folder
6515        if options.get("output_folder", ""):
6516            output_folder = options.get("output_folder", "")
6517            if not os.path.exists(output_folder):
6518                Path(output_folder).mkdir(parents=True, exist_ok=True)
6519        else:
6520            output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
6521            if not os.path.exists(output_folder):
6522                Path(output_folder).mkdir(parents=True, exist_ok=True)
6523
6524        if options.get("workdir", ""):
6525            workdir = options.get("workdir", "")
6526        else:
6527            workdir = "/work"
6528
6529        # Create tmp VCF file
6530        tmp_vcf = NamedTemporaryFile(
6531            prefix=self.get_prefix(),
6532            dir=output_folder,
6533            suffix=".vcf",
6534            delete=False,
6535        )
6536        tmp_vcf_name = tmp_vcf.name
6537
6538        # VCF header
6539        header = self.get_header()
6540
6541        # Existing annotations
6542        for vcf_annotation in self.get_header().infos:
6543
6544            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
6545            log.debug(
6546                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
6547            )
6548
6549        # Memory limit
6550        if config.get("memory", None):
6551            memory_limit = config.get("memory", "8G").upper()
6552            # upper()
6553        else:
6554            memory_limit = "8G"
6555        log.debug(f"memory_limit: {memory_limit}")
6556
6557        # Check number of variants to annotate
6558        where_clause_regex_spliceai = r"SpliceAI_\w+"
6559        where_clause_regex_spip = r"SPiP_\w+"
6560        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
6561        df_list_of_variants_to_annotate = self.get_query_to_df(
6562            query=f""" SELECT * FROM variants {where_clause} """
6563        )
6564        if len(df_list_of_variants_to_annotate) == 0:
6565            log.warning(
6566                f"No variants to annotate with splice. Variants probably already annotated with splice"
6567            )
6568            return None
6569        else:
6570            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
6571
6572        # Export VCF file
6573        self.export_variant_vcf(
6574            vcf_file=tmp_vcf_name,
6575            remove_info=True,
6576            add_samples=True,
6577            index=False,
6578            where_clause=where_clause,
6579        )
6580        mount = [f" -v {path}:{path}:rw" for path in [output_folder]]
6581        if any(value for value in splice_config.values() if value is None):
6582            log.warning("At least one splice config parameter is empty")
6583            # exit annotation_splice
6584            return None
6585
6586        # Params in splice nf
6587        def check_values(dico: dict):
6588            """
6589            Ensure parameters for NF splice pipeline
6590            """
6591            for key, val in dico.items():
6592                if key == "genome":
6593                    if any(
6594                        assemb in options.get("genome", {})
6595                        for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
6596                    ):
6597                        yield f"--{key} hg19"
6598                    elif any(
6599                        assemb in options.get("genome", {})
6600                        for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
6601                    ):
6602                        yield f"--{key} hg38"
6603                elif (
6604                    (isinstance(val, str) and val)
6605                    or isinstance(val, int)
6606                    or isinstance(val, bool)
6607                ):
6608                    yield f"--{key} {val}"
6609
6610        # Genome
6611        genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
6612        options["genome"] = genome
6613        # NF params
6614        nf_params = []
6615        # Add options
6616        if options:
6617            log.debug(options)
6618            nf_params = list(check_values(options))
6619            log.debug(f"Splice NF params: {' '.join(nf_params)}")
6620        else:
6621            log.debug("No NF params provided")
6622        # Add threads
6623        if "threads" not in options.keys():
6624            nf_params.append(f"--threads {threads}")
6625        # Genome path
6626        genome_path = find_genome(
6627            config.get("folders", {})
6628            .get("databases", {})
6629            .get("genomes", DEFAULT_GENOME_FOLDER),
6630            file=f"{genome}.fa",
6631        )
6632        # Add genome path
6633        if not genome_path:
6634            raise ValueError(
6635                f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
6636            )
6637        else:
6638            log.debug(f"Genome: {genome_path}")
6639            nf_params.append(f"--genome_path {genome_path}")
6640
6641        def splice_annotations(options: dict = {}, config: dict = {}) -> list:
6642            """
6643            Setting up updated databases for SPiP and SpliceAI
6644            """
6645
6646            try:
6647
6648                # SpliceAI assembly transcriptome
6649                spliceai_assembly = os.path.join(
6650                    config.get("folders", {}).get("databases", {}).get("spliceai", {}),
6651                    options.get("genome"),
6652                    "transcriptome",
6653                )
6654                spip_assembly = options.get("genome")
6655
6656                spip = find(
6657                    f"transcriptome_{spip_assembly}.RData",
6658                    config.get("folders", {}).get("databases", {}).get("spip", {}),
6659                )
6660                spliceai = find("spliceai.refseq.txt", spliceai_assembly)
6661                log.debug(f"SPiP annotations: {spip}")
6662                log.debug(f"SpliceAI annotations: {spliceai}")
6663                if spip and spliceai:
6664                    return [
6665                        f"--spip_transcriptome {spip}",
6666                        f"--spliceai_transcriptome {spliceai}",
6667                    ]
6668                else:
6669                    log.warning(
6670                        "Can't find splice databases in configuration, use annotations file from image"
6671                    )
6672            except TypeError:
6673                log.warning(
6674                    "Can't find splice databases in configuration, use annotations file from image"
6675                )
6676                return []
6677
6678        # Add options, check if transcriptome option have already beend provided
6679        if (
6680            "spip_transcriptome" not in nf_params
6681            and "spliceai_transcriptome" not in nf_params
6682        ):
6683            splice_reference = splice_annotations(options, config)
6684            if splice_reference:
6685                nf_params.extend(splice_reference)
6686        # nf_params.append(f"--output_folder {output_folder}")
6687        random_uuid = f"HOWARD-SPLICE-{get_random()}"
6688        cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
6689        log.debug(cmd)
6690        splice_config["docker"]["command"] = cmd
6691
6692        # Ensure proxy is set
6693        proxy = [
6694            f"-e {var}={os.getenv(var)}"
6695            for var in ["https_proxy", "http_proxy", "ftp_proxy"]
6696            if os.getenv(var) is not None
6697        ]
6698        docker_cmd = get_bin_command(
6699            tool="splice",
6700            bin_type="docker",
6701            config=config,
6702            default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
6703            add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}",
6704        )
6705        # print(docker_cmd)
6706        # exit()
6707        # Docker debug
6708        # if splice_config.get("rm_container"):
6709        #     rm_container = "--rm"
6710        # else:
6711        #     rm_container = ""
6712        # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
6713        log.debug(docker_cmd)
6714        res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
6715        log.debug(res.stdout)
6716        if res.stderr:
6717            log.error(res.stderr)
6718        res.check_returncode()
6719        # Update variants
6720        log.info("Annotation - Updating...")
6721        # Test find output vcf
6722        log.debug(
6723            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6724        )
6725        output_vcf = []
6726        # Wrong folder to look in
6727        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
6728            if (
6729                files
6730                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6731            ):
6732                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
6733        # log.debug(os.listdir(options.get("output_folder")))
6734        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
6735        if not output_vcf:
6736            log.debug(
6737                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
6738            )
6739        else:
6740            # Get new header from annotated vcf
6741            log.debug(f"Initial header: {len(header.infos)} fields")
6742            # Create new header with splice infos
6743            new_vcf = Variants(input=output_vcf[0])
6744            new_vcf_header = new_vcf.get_header().infos
6745            for keys, infos in new_vcf_header.items():
6746                if keys not in header.infos.keys():
6747                    header.infos[keys] = infos
6748            log.debug(f"New header: {len(header.infos)} fields")
6749            log.debug(f"Splice tmp output: {output_vcf[0]}")
6750            self.update_from_vcf(output_vcf[0])
6751
6752        # Remove file
6753        remove_if_exists(output_vcf)

This function annotates variants with splice prediction tools (SPiP and SpliceAI), run via a containerized Nextflow pipeline.

Parameters
  • threads: The number of threads to use
Returns

None.

def get_config_default(self, name: str) -> dict:
6759    def get_config_default(self, name: str) -> dict:
6760        """
6761        The function `get_config_default` returns a dictionary containing default configurations for
6762        various calculations and prioritizations.
6763
6764        :param name: The `get_config_default` function returns a dictionary containing default
6765        configurations for different calculations and prioritizations. The `name` parameter is used to
6766        specify which specific configuration to retrieve from the dictionary
6767        :type name: str
6768        :return: The function `get_config_default` returns a dictionary containing default configuration
6769        settings for different calculations and prioritizations. The specific configuration settings are
6770        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
6771        matches a key in the `config_default` dictionary, the corresponding configuration settings are
6772        returned. If there is no match, an empty dictionary is returned.
6773        """
6774
6775        config_default = {
6776            "calculations": {
6777                "variant_chr_pos_alt_ref": {
6778                    "type": "sql",
6779                    "name": "variant_chr_pos_alt_ref",
6780                    "description": "Create a variant ID with chromosome, position, alt and ref",
6781                    "available": False,
6782                    "output_column_name": "variant_chr_pos_alt_ref",
6783                    "output_column_type": "String",
6784                    "output_column_description": "variant ID with chromosome, position, alt and ref",
6785                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
6786                    "operation_info": True,
6787                },
6788                "VARTYPE": {
6789                    "type": "sql",
6790                    "name": "VARTYPE",
6791                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
6792                    "available": True,
6793                    "table": "variants",
6794                    "output_column_name": "VARTYPE",
6795                    "output_column_type": "String",
6796                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
6797                    "operation_query": """
6798                            CASE
6799                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
6800                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
6801                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
6802                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
6803                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
6804                                ELSE 'UNDEFINED'
6805                            END
6806                            """,
6807                    "info_fields": ["SVTYPE"],
6808                    "operation_info": True,
6809                },
6810                "snpeff_hgvs": {
6811                    "type": "python",
6812                    "name": "snpeff_hgvs",
6813                    "description": "HGVS nomenclatures from snpEff annotation",
6814                    "available": True,
6815                    "function_name": "calculation_extract_snpeff_hgvs",
6816                    "function_params": ["snpeff_hgvs", "ANN"],
6817                },
6818                "snpeff_ann_explode": {
6819                    "type": "python",
6820                    "name": "snpeff_ann_explode",
6821                    "description": "Explode snpEff annotations with uniquify values",
6822                    "available": True,
6823                    "function_name": "calculation_snpeff_ann_explode",
6824                    "function_params": [False, "fields", "snpeff_", "ANN"],
6825                },
6826                "snpeff_ann_explode_uniquify": {
6827                    "type": "python",
6828                    "name": "snpeff_ann_explode_uniquify",
6829                    "description": "Explode snpEff annotations",
6830                    "available": True,
6831                    "function_name": "calculation_snpeff_ann_explode",
6832                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
6833                },
6834                "snpeff_ann_explode_json": {
6835                    "type": "python",
6836                    "name": "snpeff_ann_explode_json",
6837                    "description": "Explode snpEff annotations in JSON format",
6838                    "available": True,
6839                    "function_name": "calculation_snpeff_ann_explode",
6840                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
6841                },
6842                "NOMEN": {
6843                    "type": "python",
6844                    "name": "NOMEN",
6845                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)",
6846                    "available": True,
6847                    "function_name": "calculation_extract_nomen",
6848                    "function_params": [],
6849                },
6850                "RENAME_INFO_FIELDS": {
6851                    "type": "python",
6852                    "name": "RENAME_INFO_FIELDS",
6853                    "description": "Rename or remove INFO/tags",
6854                    "available": True,
6855                    "function_name": "calculation_rename_info_fields",
6856                    "function_params": [],
6857                },
6858                "FINDBYPIPELINE": {
6859                    "type": "python",
6860                    "name": "FINDBYPIPELINE",
6861                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
6862                    "available": True,
6863                    "function_name": "calculation_find_by_pipeline",
6864                    "function_params": ["findbypipeline"],
6865                },
6866                "FINDBYSAMPLE": {
6867                    "type": "python",
6868                    "name": "FINDBYSAMPLE",
6869                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
6870                    "available": True,
6871                    "function_name": "calculation_find_by_pipeline",
6872                    "function_params": ["findbysample"],
6873                },
6874                "GENOTYPECONCORDANCE": {
6875                    "type": "python",
6876                    "name": "GENOTYPECONCORDANCE",
6877                    "description": "Concordance of genotype for multi caller VCF",
6878                    "available": True,
6879                    "function_name": "calculation_genotype_concordance",
6880                    "function_params": [],
6881                },
6882                "BARCODE": {
6883                    "type": "python",
6884                    "name": "BARCODE",
6885                    "description": "BARCODE as VaRank tool",
6886                    "available": True,
6887                    "function_name": "calculation_barcode",
6888                    "function_params": [],
6889                },
6890                "BARCODEFAMILY": {
6891                    "type": "python",
6892                    "name": "BARCODEFAMILY",
6893                    "description": "BARCODEFAMILY as VaRank tool",
6894                    "available": True,
6895                    "function_name": "calculation_barcode_family",
6896                    "function_params": ["BCF"],
6897                },
6898                "TRIO": {
6899                    "type": "python",
6900                    "name": "TRIO",
6901                    "description": "Inheritance for a trio family",
6902                    "available": True,
6903                    "function_name": "calculation_trio",
6904                    "function_params": [],
6905                },
6906                "VAF": {
6907                    "type": "python",
6908                    "name": "VAF",
6909                    "description": "Variant Allele Frequency (VAF) harmonization",
6910                    "available": True,
6911                    "function_name": "calculation_vaf_normalization",
6912                    "function_params": [],
6913                },
6914                "VAF_stats": {
6915                    "type": "python",
6916                    "name": "VAF_stats",
6917                    "description": "Variant Allele Frequency (VAF) statistics",
6918                    "available": True,
6919                    "function_name": "calculation_genotype_stats",
6920                    "function_params": ["VAF"],
6921                },
6922                "DP_stats": {
6923                    "type": "python",
6924                    "name": "DP_stats",
6925                    "description": "Depth (DP) statistics",
6926                    "available": True,
6927                    "function_name": "calculation_genotype_stats",
6928                    "function_params": ["DP"],
6929                },
6930                "variant_id": {
6931                    "type": "python",
6932                    "name": "variant_id",
6933                    "description": "Variant ID generated from variant position and type",
6934                    "available": True,
6935                    "function_name": "calculation_variant_id",
6936                    "function_params": [],
6937                },
6938                "transcripts_json": {
6939                    "type": "python",
6940                    "name": "transcripts_json",
6941                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
6942                    "available": True,
6943                    "function_name": "calculation_transcripts_annotation",
6944                    "function_params": ["transcripts_json", None],
6945                },
6946                "transcripts_ann": {
6947                    "type": "python",
6948                    "name": "transcripts_ann",
6949                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
6950                    "available": True,
6951                    "function_name": "calculation_transcripts_annotation",
6952                    "function_params": [None, "transcripts_ann"],
6953                },
6954                "transcripts_annotations": {
6955                    "type": "python",
6956                    "name": "transcripts_annotations",
6957                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
6958                    "available": True,
6959                    "function_name": "calculation_transcripts_annotation",
6960                    "function_params": [None, None],
6961                },
6962                "transcripts_prioritization": {
6963                    "type": "python",
6964                    "name": "transcripts_prioritization",
6965                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
6966                    "available": True,
6967                    "function_name": "calculation_transcripts_prioritization",
6968                    "function_params": [],
6969                },
6970                "transcripts_export": {
6971                    "type": "python",
6972                    "name": "transcripts_export",
6973                    "description": "Export transcripts table/view as a file (using param.json)",
6974                    "available": True,
6975                    "function_name": "calculation_transcripts_export",
6976                    "function_params": [],
6977                },
6978            },
6979            "prioritizations": {
6980                "default": {
6981                    "ANN2": [
6982                        {
6983                            "type": "contains",
6984                            "value": "HIGH",
6985                            "score": 5,
6986                            "flag": "PASS",
6987                            "comment": [
6988                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
6989                            ],
6990                        },
6991                        {
6992                            "type": "contains",
6993                            "value": "MODERATE",
6994                            "score": 3,
6995                            "flag": "PASS",
6996                            "comment": [
6997                                "A non-disruptive variant that might change protein effectiveness"
6998                            ],
6999                        },
7000                        {
7001                            "type": "contains",
7002                            "value": "LOW",
7003                            "score": 0,
7004                            "flag": "FILTERED",
7005                            "comment": [
7006                                "Assumed to be mostly harmless or unlikely to change protein behavior"
7007                            ],
7008                        },
7009                        {
7010                            "type": "contains",
7011                            "value": "MODIFIER",
7012                            "score": 0,
7013                            "flag": "FILTERED",
7014                            "comment": [
7015                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
7016                            ],
7017                        },
7018                    ],
7019                }
7020            },
7021        }
7022
7023        return config_default.get(name, None)

The function get_config_default returns a dictionary containing default configurations for various calculations and prioritizations.

Parameters
  • name: The get_config_default function returns a dictionary containing default configurations for different calculations and prioritizations. The name parameter is used to specify which specific configuration to retrieve from the dictionary
Returns

The function get_config_default returns a dictionary containing default configuration settings for different calculations and prioritizations. The specific configuration settings are retrieved based on the input name parameter provided to the function. If the name parameter matches a key in the config_default dictionary, the corresponding configuration settings are returned. If there is no match, an empty dictionary is returned.

def get_config_json(self, name: str, config_dict: dict = {}, config_file: str = None) -> dict:
7025    def get_config_json(
7026        self, name: str, config_dict: dict = {}, config_file: str = None
7027    ) -> dict:
7028        """
7029        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
7030        default values, a dictionary, and a file.
7031
7032        :param name: The `name` parameter in the `get_config_json` function is a string that represents
7033        the name of the configuration. It is used to identify and retrieve the configuration settings
7034        for a specific component or module
7035        :type name: str
7036        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
7037        dictionary that allows you to provide additional configuration settings or overrides. When you
7038        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
7039        the key is the configuration setting you want to override or
7040        :type config_dict: dict
7041        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
7042        specify the path to a configuration file that contains additional settings. If provided, the
7043        function will read the contents of this file and update the configuration dictionary with the
7044        values found in the file, overriding any existing values with the
7045        :type config_file: str
7046        :return: The function `get_config_json` returns a dictionary containing the configuration
7047        settings.
7048        """
7049
7050        # Create with default prioritizations
7051        config_default = self.get_config_default(name=name)
7052        configuration = config_default
7053        # log.debug(f"configuration={configuration}")
7054
7055        # Replace prioritizations from dict
7056        for config in config_dict:
7057            configuration[config] = config_dict[config]
7058
7059        # Replace prioritizations from file
7060        config_file = full_path(config_file)
7061        if config_file:
7062            if os.path.exists(config_file):
7063                with open(config_file) as config_file_content:
7064                    config_file_dict = yaml.safe_load(config_file_content)
7065                for config in config_file_dict:
7066                    configuration[config] = config_file_dict[config]
7067            else:
7068                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
7069                log.error(msg_error)
7070                raise ValueError(msg_error)
7071
7072        return configuration

The function get_config_json retrieves a configuration JSON object with prioritizations from default values, a dictionary, and a file.

Parameters
  • name: The name parameter in the get_config_json function is a string that represents the name of the configuration. It is used to identify and retrieve the configuration settings for a specific component or module
  • config_dict: The config_dict parameter in the get_config_json function is a dictionary that allows you to provide additional configuration settings or overrides. When you call the get_config_json function, you can pass a dictionary containing key-value pairs where the key is the configuration setting you want to override or
  • config_file: The config_file parameter in the get_config_json function is used to specify the path to a configuration file that contains additional settings. If provided, the function will read the contents of this file and update the configuration dictionary with the values found in the file, overriding any existing values with the
Returns

The function get_config_json returns a dictionary containing the configuration settings.

def prioritization( self, table: str = None, pz_prefix: str = None, pz_param: dict = None) -> bool:
7074    def prioritization(
7075        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
7076    ) -> bool:
7077        """
7078        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
7079        prioritizes variants based on configured profiles and criteria.
7080
7081        :param table: The `table` parameter in the `prioritization` function is used to specify the name
7082        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
7083        a table name is provided, the method will prioritize the variants in that specific table
7084        :type table: str
7085        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
7086        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
7087        provided, the code will use a default prefix value of "PZ"
7088        :type pz_prefix: str
7089        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
7090        additional parameters specific to the prioritization process. These parameters can include
7091        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
7092        configurations needed for the prioritization of variants in a V
7093        :type pz_param: dict
7094        :return: A boolean value (True) is being returned from the `prioritization` function.
7095        """
7096
7097        # Config
7098        config = self.get_config()
7099
7100        # Param
7101        param = self.get_param()
7102
7103        # Prioritization param
7104        if pz_param is not None:
7105            prioritization_param = pz_param
7106        else:
7107            prioritization_param = param.get("prioritization", {})
7108
7109        # Configuration profiles
7110        prioritization_config_file = prioritization_param.get(
7111            "prioritization_config", None
7112        )
7113        prioritization_config_file = full_path(prioritization_config_file)
7114        prioritizations_config = self.get_config_json(
7115            name="prioritizations", config_file=prioritization_config_file
7116        )
7117
7118        # Prioritization prefix
7119        pz_prefix_default = "PZ"
7120        if pz_prefix is None:
7121            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
7122
7123        # Prioritization options
7124        profiles = prioritization_param.get("profiles", [])
7125        if isinstance(profiles, str):
7126            profiles = profiles.split(",")
7127        pzfields = prioritization_param.get(
7128            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
7129        )
7130        if isinstance(pzfields, str):
7131            pzfields = pzfields.split(",")
7132        default_profile = prioritization_param.get("default_profile", None)
7133        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
7134        prioritization_score_mode = prioritization_param.get(
7135            "prioritization_score_mode", "HOWARD"
7136        )
7137
7138        # Quick Prioritizations
7139        prioritizations = param.get("prioritizations", None)
7140        if prioritizations:
7141            log.info("Quick Prioritization:")
7142            for profile in prioritizations.split(","):
7143                if profile not in profiles:
7144                    profiles.append(profile)
7145                    log.info(f"   {profile}")
7146
7147        # If profile "ALL" provided, all profiles in the config profiles
7148        if "ALL" in profiles:
7149            profiles = list(prioritizations_config.keys())
7150
7151        for profile in profiles:
7152            if prioritizations_config.get(profile, None):
7153                log.debug(f"Profile '{profile}' configured")
7154            else:
7155                msg_error = f"Profile '{profile}' NOT configured"
7156                log.error(msg_error)
7157                raise ValueError(msg_error)
7158
7159        if profiles:
7160            log.info(f"Prioritization... ")
7161        else:
7162            log.debug(f"No profile defined")
7163            return False
7164
7165        if not default_profile and len(profiles):
7166            default_profile = profiles[0]
7167
7168        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
7169        log.debug("Profiles to check: " + str(list(profiles)))
7170
7171        # Variables
7172        if table is not None:
7173            table_variants = table
7174        else:
7175            table_variants = self.get_table_variants(clause="update")
7176        log.debug(f"Table to prioritize: {table_variants}")
7177
7178        # Added columns
7179        added_columns = []
7180
7181        # Create list of PZfields
7182        # List of PZFields
7183        list_of_pzfields_original = pzfields + [
7184            pzfield + pzfields_sep + profile
7185            for pzfield in pzfields
7186            for profile in profiles
7187        ]
7188        list_of_pzfields = []
7189        log.debug(f"{list_of_pzfields_original}")
7190
7191        # Remove existing PZfields to use if exists
7192        for pzfield in list_of_pzfields_original:
7193            if self.get_header().infos.get(pzfield, None) is None:
7194                list_of_pzfields.append(pzfield)
7195                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
7196            else:
7197                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
7198
7199        if list_of_pzfields:
7200
7201            # Explode Infos prefix
7202            explode_infos_prefix = self.get_explode_infos_prefix()
7203
7204            # PZfields tags description
7205            PZfields_INFOS = {
7206                f"{pz_prefix}Tags": {
7207                    "ID": f"{pz_prefix}Tags",
7208                    "Number": ".",
7209                    "Type": "String",
7210                    "Description": "Variant tags based on annotation criteria",
7211                },
7212                f"{pz_prefix}Score": {
7213                    "ID": f"{pz_prefix}Score",
7214                    "Number": 1,
7215                    "Type": "Integer",
7216                    "Description": "Variant score based on annotation criteria",
7217                },
7218                f"{pz_prefix}Flag": {
7219                    "ID": f"{pz_prefix}Flag",
7220                    "Number": 1,
7221                    "Type": "String",
7222                    "Description": "Variant flag based on annotation criteria",
7223                },
7224                f"{pz_prefix}Comment": {
7225                    "ID": f"{pz_prefix}Comment",
7226                    "Number": ".",
7227                    "Type": "String",
7228                    "Description": "Variant comment based on annotation criteria",
7229                },
7230                f"{pz_prefix}Infos": {
7231                    "ID": f"{pz_prefix}Infos",
7232                    "Number": ".",
7233                    "Type": "String",
7234                    "Description": "Variant infos based on annotation criteria",
7235                },
7236                f"{pz_prefix}Class": {
7237                    "ID": f"{pz_prefix}Class",
7238                    "Number": ".",
7239                    "Type": "String",
7240                    "Description": "Variant class based on annotation criteria",
7241                },
7242            }
7243
7244            # Create INFO fields if not exist
7245            for field in PZfields_INFOS:
7246                field_ID = PZfields_INFOS[field]["ID"]
7247                field_description = PZfields_INFOS[field]["Description"]
7248                if field_ID not in self.get_header().infos and field_ID in pzfields:
7249                    field_description = (
7250                        PZfields_INFOS[field]["Description"]
7251                        + f", profile {default_profile}"
7252                    )
7253                    self.get_header().infos[field_ID] = vcf.parser._Info(
7254                        field_ID,
7255                        PZfields_INFOS[field]["Number"],
7256                        PZfields_INFOS[field]["Type"],
7257                        field_description,
7258                        "unknown",
7259                        "unknown",
7260                        code_type_map[PZfields_INFOS[field]["Type"]],
7261                    )
7262
7263            # Create INFO fields if not exist for each profile
7264            for profile in prioritizations_config:
7265                if profile in profiles or profiles == []:
7266                    for field in PZfields_INFOS:
7267                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
7268                        field_description = (
7269                            PZfields_INFOS[field]["Description"]
7270                            + f", profile {profile}"
7271                        )
7272                        if (
7273                            field_ID not in self.get_header().infos
7274                            and field in pzfields
7275                        ):
7276                            self.get_header().infos[field_ID] = vcf.parser._Info(
7277                                field_ID,
7278                                PZfields_INFOS[field]["Number"],
7279                                PZfields_INFOS[field]["Type"],
7280                                field_description,
7281                                "unknown",
7282                                "unknown",
7283                                code_type_map[PZfields_INFOS[field]["Type"]],
7284                            )
7285
7286            # Header
7287            for pzfield in list_of_pzfields:
7288                if re.match(f"{pz_prefix}Score.*", pzfield):
7289                    added_column = self.add_column(
7290                        table_name=table_variants,
7291                        column_name=pzfield,
7292                        column_type="INTEGER",
7293                        default_value="0",
7294                    )
7295                elif re.match(f"{pz_prefix}Flag.*", pzfield):
7296                    added_column = self.add_column(
7297                        table_name=table_variants,
7298                        column_name=pzfield,
7299                        column_type="BOOLEAN",
7300                        default_value="1",
7301                    )
7302                elif re.match(f"{pz_prefix}Class.*", pzfield):
7303                    added_column = self.add_column(
7304                        table_name=table_variants,
7305                        column_name=pzfield,
7306                        column_type="VARCHAR[]",
7307                        default_value="null",
7308                    )
7309                else:
7310                    added_column = self.add_column(
7311                        table_name=table_variants,
7312                        column_name=pzfield,
7313                        column_type="STRING",
7314                        default_value="''",
7315                    )
7316                added_columns.append(added_column)
7317
7318            # Profiles
7319            if profiles:
7320
7321                # foreach profile in configuration file
7322                for profile in prioritizations_config:
7323
7324                    # If profile is asked in param, or ALL are asked (empty profile [])
7325                    if profile in profiles or profiles == []:
7326                        log.info(f"Profile '{profile}'")
7327
7328                        sql_set_info_option = ""
7329
7330                        sql_set_info = []
7331
7332                        # PZ fields set
7333
7334                        # PZScore
7335                        if (
7336                            f"{pz_prefix}Score{pzfields_sep}{profile}"
7337                            in list_of_pzfields
7338                        ):
7339                            sql_set_info.append(
7340                                f"""
7341                                    concat(
7342                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
7343                                        {pz_prefix}Score{pzfields_sep}{profile}
7344                                    ) 
7345                                """
7346                            )
7347                            if (
7348                                profile == default_profile
7349                                and f"{pz_prefix}Score" in list_of_pzfields
7350                            ):
7351                                sql_set_info.append(
7352                                    f"""
7353                                        concat(
7354                                            '{pz_prefix}Score=',
7355                                            {pz_prefix}Score{pzfields_sep}{profile}
7356                                        )
7357                                    """
7358                                )
7359
7360                        # PZFlag
7361                        if (
7362                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
7363                            in list_of_pzfields
7364                        ):
7365                            sql_set_info.append(
7366                                f"""
7367                                    concat(
7368                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
7369                                        CASE 
7370                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
7371                                            THEN 'PASS'
7372                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
7373                                            THEN 'FILTERED'
7374                                        END
7375                                    ) 
7376                                """
7377                            )
7378                            if (
7379                                profile == default_profile
7380                                and f"{pz_prefix}Flag" in list_of_pzfields
7381                            ):
7382                                sql_set_info.append(
7383                                    f"""
7384                                        concat(
7385                                            '{pz_prefix}Flag=',
7386                                            CASE 
7387                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
7388                                                THEN 'PASS'
7389                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
7390                                                THEN 'FILTERED'
7391                                            END
7392                                        )
7393                                    """
7394                                )
7395
7396                        # PZClass
7397                        if (
7398                            f"{pz_prefix}Class{pzfields_sep}{profile}"
7399                            in list_of_pzfields
7400                        ):
7401                            sql_set_info.append(
7402                                f"""
7403                                    concat(
7404                                        '{pz_prefix}Class{pzfields_sep}{profile}=',
7405                                        CASE
7406                                            WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
7407                                            THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
7408                                            ELSE '.'
7409                                        END 
7410                                    )
7411                                    
7412                                """
7413                            )
7414                            if (
7415                                profile == default_profile
7416                                and f"{pz_prefix}Class" in list_of_pzfields
7417                            ):
7418                                sql_set_info.append(
7419                                    f"""
7420                                        concat(
7421                                            '{pz_prefix}Class=',
7422                                            CASE
7423                                                WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
7424                                                THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
7425                                                ELSE '.'
7426                                            END 
7427                                        )
7428                                    """
7429                                )
7430
7431                        # PZComment
7432                        if (
7433                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
7434                            in list_of_pzfields
7435                        ):
7436                            sql_set_info.append(
7437                                f"""
7438                                    CASE
7439                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
7440                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
7441                                        ELSE ''
7442                                    END
7443                                """
7444                            )
7445                            if (
7446                                profile == default_profile
7447                                and f"{pz_prefix}Comment" in list_of_pzfields
7448                            ):
7449                                sql_set_info.append(
7450                                    f"""
7451                                        CASE
7452                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
7453                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
7454                                            ELSE ''
7455                                        END
7456                                    """
7457                                )
7458
7459                        # PZInfos
7460                        if (
7461                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
7462                            in list_of_pzfields
7463                        ):
7464                            sql_set_info.append(
7465                                f"""
7466                                    CASE
7467                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
7468                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
7469                                        ELSE ''
7470                                    END
7471                                """
7472                            )
7473                            if (
7474                                profile == default_profile
7475                                and f"{pz_prefix}Infos" in list_of_pzfields
7476                            ):
7477                                sql_set_info.append(
7478                                    f"""
7479                                        CASE
7480                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
7481                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
7482                                            ELSE ''
7483                                        END
7484                                    """
7485                                )
7486
7487                        # Merge PZfields
7488                        sql_set_info_option = ""
7489                        sql_set_sep = ""
7490                        for sql_set in sql_set_info:
7491                            if sql_set_sep:
7492                                sql_set_info_option += f"""
7493                                    , concat('{sql_set_sep}', {sql_set})
7494                                """
7495                            else:
7496                                sql_set_info_option += f"""
7497                                    , {sql_set}
7498                                """
7499                            sql_set_sep = ";"
7500
7501                        sql_queries = []
7502                        criterion_fields_profile = []
7503                        annotation_view_name = (
7504                            "annotation_view_for_prioritization_"
7505                            + str(random.randrange(1000))
7506                        )
7507                        annotation_view_prefix = ""
7508                        for annotation in prioritizations_config[profile]:
7509
7510                            # skip special sections
7511                            if annotation.startswith("_"):
7512                                continue
7513
7514                            # For each criterions
7515                            for criterion in prioritizations_config[profile][
7516                                annotation
7517                            ]:
7518
7519                                # Criterion mode
7520                                criterion_mode = None
7521                                if np.any(
7522                                    np.isin(list(criterion.keys()), ["type", "value"])
7523                                ):
7524                                    criterion_mode = "operation"
7525                                elif np.any(
7526                                    np.isin(list(criterion.keys()), ["sql", "fields"])
7527                                ):
7528                                    criterion_mode = "sql"
7529                                log.debug(f"Criterion Mode: {criterion_mode}")
7530
7531                                # Criterion parameters
7532                                criterion_type = criterion.get("type", None)
7533                                criterion_value = criterion.get("value", None)
7534                                criterion_sql = criterion.get("sql", None)
7535                                criterion_fields = criterion.get("fields", None)
7536                                criterion_score = criterion.get("score", 0)
7537                                criterion_flag = criterion.get("flag", "PASS")
7538                                criterion_class = criterion.get("class", None)
7539                                criterion_flag_bool = criterion_flag == "PASS"
7540                                criterion_comment = (
7541                                    ", ".join(criterion.get("comment", []))
7542                                    .replace("'", "''")
7543                                    .replace(";", ",")
7544                                    .replace("\t", " ")
7545                                )
7546                                criterion_infos = (
7547                                    str(criterion)
7548                                    .replace("'", "''")
7549                                    .replace(";", ",")
7550                                    .replace("\t", " ")
7551                                )
7552
7553                                # SQL
7554                                if criterion_sql is not None and isinstance(
7555                                    criterion_sql, list
7556                                ):
7557                                    criterion_sql = " ".join(criterion_sql)
7558
7559                                # Fields and explode
7560                                if criterion_fields is None:
7561                                    criterion_fields = [annotation]
7562                                if not isinstance(criterion_fields, list):
7563                                    criterion_fields = str(criterion_fields).split(",")
7564
7565                                # Class
7566                                if criterion_class is not None and not isinstance(
7567                                    criterion_class, list
7568                                ):
7569                                    criterion_class = str(criterion_class).split(",")
7570
7571                                # Add criterion fields to the list of profile's criteria
7572                                criterion_fields_profile = list(
7573                                    set(criterion_fields_profile + criterion_fields)
7574                                )
7575
7576                                sql_set = []
7577                                sql_set_info = []
7578
7579                                # PZ fields set
7580
7581                                # PZScore
7582                                if (
7583                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
7584                                    in list_of_pzfields
7585                                ):
7586                                    # VaRank prioritization score mode
7587                                    if prioritization_score_mode.upper().strip() in [
7588                                        "VARANK",
7589                                        "MAX",
7590                                        "MAXIMUM",
7591                                        "TOP",
7592                                    ]:
7593                                        sql_set.append(
7594                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} ELSE {pz_prefix}Score{pzfields_sep}{profile} END "
7595                                        )
7596                                    # default HOWARD prioritization score mode
7597                                    else:
7598                                        sql_set.append(
7599                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
7600                                        )
7601
7602                                # PZFlag
7603                                if (
7604                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
7605                                    in list_of_pzfields
7606                                ):
7607                                    sql_set.append(
7608                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
7609                                    )
7610
7611                                # PZClass
7612                                if (
7613                                    f"{pz_prefix}Class{pzfields_sep}{profile}"
7614                                    in list_of_pzfields
7615                                    and criterion_class is not None
7616                                ):
7617                                    sql_set.append(
7618                                        f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
7619                                    )
7620
7621                                # PZComment
7622                                if (
7623                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
7624                                    in list_of_pzfields
7625                                ):
7626                                    sql_set.append(
7627                                        f"""
7628                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
7629                                                concat(
7630                                                    {pz_prefix}Comment{pzfields_sep}{profile},
7631                                                    CASE 
7632                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
7633                                                        THEN ', '
7634                                                        ELSE ''
7635                                                    END,
7636                                                    '{criterion_comment}'
7637                                                )
7638                                        """
7639                                    )
7640
7641                                # PZInfos
7642                                if (
7643                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
7644                                    in list_of_pzfields
7645                                ):
7646                                    sql_set.append(
7647                                        f"""
7648                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
7649                                                concat(
7650                                                    {pz_prefix}Infos{pzfields_sep}{profile},
7651                                                    '{criterion_infos}'
7652                                                )
7653                                        """
7654                                    )
7655                                sql_set_option = ",".join(sql_set)
7656
7657                                # Criterion and comparison
7658                                if sql_set_option:
7659
7660                                    # Operation mode
7661                                    if criterion_mode in ["operation"]:
7662
7663                                        # Check if value is a float
7664                                        try:
7665                                            float(criterion_value)
7666                                            sql_update = f"""
7667                                                UPDATE "{table_variants}"
7668                                                SET {sql_set_option}
7669                                                FROM (
7670                                                    SELECT *
7671                                                    FROM "{annotation_view_name}"
7672                                                    WHERE (
7673                                                        CAST("{annotation_view_name}"."{annotation_view_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
7674                                                        AND   CAST("{annotation_view_name}"."{annotation_view_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
7675                                                        )
7676                                                    ) AS "{annotation_view_name}"
7677                                                WHERE "{table_variants}"."#CHROM" == "{annotation_view_name}"."#CHROM"
7678                                                  AND "{table_variants}"."POS" == "{annotation_view_name}"."POS"
7679                                                  AND "{table_variants}"."REF" == "{annotation_view_name}"."REF"
7680                                                  AND "{table_variants}"."ALT" == "{annotation_view_name}"."ALT" 
7681                                                
7682                                            """
7683                                        # If not a floatÃ’
7684                                        except:
7685                                            contains_option = ""
7686                                            if criterion_type == "contains":
7687                                                contains_option = ".*"
7688                                            sql_update = f"""
7689                                                UPDATE "{table_variants}"
7690                                                SET {sql_set_option}
7691                                                FROM (
7692                                                    SELECT *
7693                                                    FROM "{annotation_view_name}"
7694                                                    WHERE (
7695                                                        "{annotation_view_name}"."{annotation_view_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
7696                                                        )
7697                                                    ) AS "{annotation_view_name}"
7698                                                WHERE "{table_variants}"."#CHROM" == "{annotation_view_name}"."#CHROM"
7699                                                  AND "{table_variants}"."POS" == "{annotation_view_name}"."POS"
7700                                                  AND "{table_variants}"."REF" == "{annotation_view_name}"."REF"
7701                                                  AND "{table_variants}"."ALT" == "{annotation_view_name}"."ALT" 
7702                                                  
7703                                            """
7704                                        sql_queries.append(sql_update)
7705
7706                                    # SQL mode
7707                                    elif criterion_mode in ["sql"]:
7708
7709                                        sql_update = f"""
7710                                            UPDATE {table_variants}
7711                                            SET {sql_set_option}
7712                                            FROM (
7713                                                SELECT *
7714                                                FROM "{annotation_view_name}"
7715                                                WHERE ({criterion_sql})
7716                                                ) AS "{annotation_view_name}"
7717                                            WHERE "{table_variants}"."#CHROM" == "{annotation_view_name}"."#CHROM"
7718                                                AND "{table_variants}"."POS" == "{annotation_view_name}"."POS"
7719                                                AND "{table_variants}"."REF" == "{annotation_view_name}"."REF"
7720                                                AND "{table_variants}"."ALT" == "{annotation_view_name}"."ALT" 
7721                                        """
7722                                        sql_queries.append(sql_update)
7723
7724                                    else:
7725                                        msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
7726                                        log.error(msg_err)
7727                                        raise ValueError(msg_err)
7728
7729                                else:
7730                                    log.warning(
7731                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
7732                                    )
7733
7734                        # PZTags
7735                        if (
7736                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
7737                            in list_of_pzfields
7738                        ):
7739
7740                            # Create PZFalgs value
7741                            pztags_value = ""
7742                            pztags_sep_default = ","
7743                            pztags_sep = ""
7744                            for pzfield in pzfields:
7745                                if pzfield not in [f"{pz_prefix}Tags"]:
7746                                    if (
7747                                        f"{pzfield}{pzfields_sep}{profile}"
7748                                        in list_of_pzfields
7749                                    ):
7750                                        if pzfield in [f"{pz_prefix}Flag"]:
7751                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
7752                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
7753                                                    THEN 'PASS'
7754                                                    ELSE 'FILTERED'
7755                                                END, '"""
7756                                        elif pzfield in [f"{pz_prefix}Class"]:
7757                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
7758                                                CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
7759                                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
7760                                                    ELSE '.'
7761                                                END, '"""
7762                                        else:
7763                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
7764                                        pztags_sep = pztags_sep_default
7765
7766                            # Add Query update for PZFlags
7767                            sql_update_pztags = f"""
7768                                UPDATE {table_variants}
7769                                SET INFO = concat(
7770                                        INFO,
7771                                        CASE WHEN INFO NOT in ('','.')
7772                                                THEN ';'
7773                                                ELSE ''
7774                                        END,
7775                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
7776                                    )
7777                                WHERE 1=1
7778                                """
7779                            sql_queries.append(sql_update_pztags)
7780
7781                            # Add Query update for PZFlags for default
7782                            if profile == default_profile:
7783                                sql_update_pztags_default = f"""
7784                                UPDATE {table_variants}
7785                                SET INFO = concat(
7786                                        INFO,
7787                                        ';',
7788                                        '{pz_prefix}Tags={pztags_value}'
7789                                    )
7790                                    WHERE 1=1
7791                                """
7792                                sql_queries.append(sql_update_pztags_default)
7793
7794                        log.info(f"""Profile '{profile}' - Prioritization... """)
7795
7796                        # Create annotations view for prioritization
7797                        log.debug(
7798                            f"""Profile '{profile}' - Prioritization - Create '{annotation_view_name}' view with '{criterion_fields_profile}'... """
7799                        )
7800                        annotation_view = self.create_annotations_view(
7801                            view=annotation_view_name,
7802                            prefix=annotation_view_prefix,
7803                            fields=criterion_fields_profile,
7804                            drop_view=True,
7805                        )
7806
7807                        # Chromosomes list
7808                        sql_uniq_chrom = f"""
7809                            SELECT DISTINCT "#CHROM"
7810                            FROM {table_variants}
7811                        """
7812                        chroms = self.get_query_to_df(sql_uniq_chrom)["#CHROM"].tolist()
7813
7814                        for chrom in chroms:
7815
7816                            log.debug(
7817                                f"""Profile '{profile}' - Prioritization query - Chromosome '{chrom}'... """
7818                            )
7819
7820                            if sql_queries:
7821
7822                                # Query num
7823                                num_query = 0
7824
7825                                # For each query
7826                                for sql_query in sql_queries:
7827
7828                                    # Query num
7829                                    num_query += 1
7830
7831                                    sql_query_chrom = f"""
7832                                        {sql_query}
7833                                        AND {table_variants}."#CHROM" LIKE '{chrom}' 
7834                                    """
7835                                    log.debug(
7836                                        f"""Profile '{profile}' - Prioritization query - Chromosome '{chrom}' [{num_query}/{len(sql_queries)}]"""
7837                                    )
7838                                    # log.debug(f"""sql_query_chrom: {sql_query_chrom}""")
7839                                    self.execute_query(query=sql_query_chrom)
7840
7841                        # Update INFO field
7842                        log.info(f"""Profile '{profile}' - Update... """)
7843                        sql_query_update = f"""
7844                            UPDATE {table_variants}
7845                            SET INFO =  
7846                                concat(
7847                                    CASE
7848                                        WHEN INFO NOT IN ('','.')
7849                                        THEN concat(INFO, ';')
7850                                        ELSE ''
7851                                    END
7852                                    {sql_set_info_option}
7853                                )
7854                        """
7855                        # log.debug(f"sql_query_update={sql_query_update}")
7856                        self.execute_query(query=sql_query_update)
7857
7858                        # Remove annotations view for prioritization
7859                        query_drop_tmp_table = f"""
7860                            DROP VIEW IF EXISTS {annotation_view_name}
7861                        """
7862                        self.execute_query(query=query_drop_tmp_table)
7863
7864        else:
7865
7866            log.warning(f"No profiles in parameters")
7867
7868        # Remove added columns
7869        for added_column in added_columns:
7870            self.drop_column(column=added_column)
7871
7872        # Explode INFOS fields into table fields
7873        if self.get_explode_infos():
7874            self.explode_infos(
7875                prefix=self.get_explode_infos_prefix(),
7876                fields=self.get_explode_infos_fields(),
7877                force=True,
7878            )
7879
7880        return True

The prioritization function in Python processes VCF files, adds new INFO fields, and prioritizes variants based on configured profiles and criteria.

Parameters
  • table: The table parameter in the prioritization function is used to specify the name of the table (presumably a VCF file) on which the prioritization operation will be performed. If a table name is provided, the method will prioritize the variants in that specific table
  • pz_prefix: The pz_prefix parameter is used to specify a prefix that will be added to certain INFO fields in a VCF file during the prioritization process. If this parameter is not provided, the code will use a default prefix value of "PZ"
  • pz_param: The pz_param parameter in the prioritization method is used to pass additional parameters specific to the prioritization process. These parameters can include settings related to prioritization profiles, fields, scoring modes, flags, comments, and other configurations needed for the prioritization of variants in a VCF file.
Returns

A boolean value (True) is being returned from the prioritization function.

    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        It loads the genome (pyfaidx) and the refSeq/refSeqLink databases, extracts
        SNV/InDel variants from the variants table, computes HGVS names in parallel
        (one Dask partition per thread), writes the result into a temporary column,
        appends it to the INFO field, and registers the 'hgvs' INFO header.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # NOTE(review): this method relies on module-level names `pl` (polars) and
        # `dd` (dask.dataframe) that are not visible in the imports shown at the top
        # of this file — confirm they are imported elsewhere.

        # Function for each partition of the Dask Dataframe
        def partition_function(partition):
            """
            The function `partition_function` applies the `annotation_hgvs_partition` function to
            each row of a DataFrame called `partition`.

            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
            to be processed
            :return: the result of applying the "annotation_hgvs_partition" function to each row of
            the "partition" dataframe along the axis 1.
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            The function `annotation_hgvs_partition` takes in a row of data and returns a string
            containing a list of HGVS names associated with the given genomic coordinates and alleles.

            :param row: A dictionary-like object that contains the values for the
            keys CHROM, POS, REF and ALT
            :return: a string that contains the HGVS names associated with the given row of data.
            """

            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                SELECT transcript
                FROM refseq_df
                WHERE CHROM='{chr}'
                AND POS={pos}
            """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein
                # NOTE(review): refseqlink_df is only defined later when a refSeqLink
                # file was found; if use_protein/add_protein/full_format is enabled
                # without a refSeqLink database, this lookup raises NameError — TODO confirm.
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                        SELECT protein
                        FROM refseqlink_df
                        WHERE transcript='{transcript_name}'
                        LIMIT 1
                    """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # Optionally add a second, protein-level HGVS name for the same transcript
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create list of HGVS annotations (comma-separated)
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connection
        # NOTE(review): re-created later once refseq_df/refseqlink_df exist; relies
        # on SQLContext(register_globals=True) picking up those frames — confirm.
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS: parse "hgvs_options" (comma-separated var=val pairs) into param["hgvs"]
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    option_val = "True"
                # Coerce textual booleans ("true"/"false", any case) to bool
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f"   {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled; bail out silently otherwise
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSeq refSeqLink (param overrides config)
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome: explicit genome path first, otherwise look it up by assembly
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSeq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only (letter-only REF and ALT)
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
            """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns
        added_columns = []

        # Add hgvs column in variants table
        # NOTE(review): the random suffix avoids clashing with an existing column,
        # but a collision is still possible — TODO confirm acceptable.
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading all refSeq in Dataframe: transcripts overlapping each variant position
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading all refSeqLink in Dataframe (transcript -> protein accession)
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table} 
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model.
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connection (re-created now that refseq_df/refseqlink_df exist)
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create
        # a Dask Dataframe from Pandas dataframe with partition as number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column from the parquet file, matching on CHROM/POS/REF/ALT
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
                """
            self.execute_query(update_variant_query)

        # Update INFO column: append 'hgvs=<names>' for annotated variants only
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO = 
                concat(
                    CASE 
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
        self.execute_query(sql_query_update)

        # Add header
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        # Register the new INFO field in the VCF header
        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

The annotation_hgvs function performs HGVS annotation on a set of variants using genomic coordinates and alleles.

Parameters
  • threads: The threads parameter is an optional integer that specifies the number of threads to use for parallel processing. If no value is provided, it will default to the number of threads obtained from the get_threads() method
def get_operations_help( self, operations_config_dict: dict = {}, operations_config_file: str = None) -> list:
8273    def get_operations_help(
8274        self, operations_config_dict: dict = {}, operations_config_file: str = None
8275    ) -> list:
8276
8277        # Init
8278        operations_help = []
8279
8280        # operations
8281        operations = self.get_config_json(
8282            name="calculations",
8283            config_dict=operations_config_dict,
8284            config_file=operations_config_file,
8285        )
8286        for op in operations:
8287            op_name = operations[op].get("name", op).upper()
8288            op_description = operations[op].get("description", op_name)
8289            op_available = operations[op].get("available", False)
8290            if op_available:
8291                operations_help.append(f"   {op_name}: {op_description}")
8292
8293        # Sort operations
8294        operations_help.sort()
8295
8296        # insert header
8297        operations_help.insert(0, "Available calculation operations:")
8298
8299        # Return
8300        return operations_help
def calculation( self, operations: dict = {}, operations_config_dict: dict = {}, operations_config_file: str = None) -> None:
    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        Run a set of calculation operations on the variants table.

        For each requested operation, looks it up (case-insensitively) in the
        calculations configuration, then dispatches to either a Python method
        (type "python") or a SQL query (type "sql").

        :param operations: dict of operations to apply; overridden by the
            "calculation"/"calculations" section of the parameters when present
        :param operations_config_dict: optional dict holding the calculations
            configuration
        :param operations_config_file: optional path to a calculations
            configuration file; falls back to the
            "calculation"/"calculation_config" parameter
        :raises ValueError: if an operation is unknown, or its configured type
            is neither "python" nor "sql"

        param json example:
            "calculation": {
                "NOMEN": {
                    "options": {
                        "hgvs_field": "hgvs"
                    }
                },
                "middle": null
            }
        """

        # Param
        param = self.get_param()

        # Check operations config file (fall back to the parameters section)
        if operations_config_file is None:
            operations_config_file = param.get("calculation", {}).get(
                "calculation_config", None
            )

        # operations config
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Upper keys — operation names are matched case-insensitively
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param (parameters take precedence over the argument)
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation - add (comma-separated "calculations" parameter)
        if param.get("calculations", None):

            # List of operations
            calculations_list = [
                value.strip() for value in param.get("calculations", "").split(",")
            ]

            # Log
            log.info(f"Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f"   {calculation_key}")

            # Create tmp operations (to keep operation order: quick ones first)
            operations_tmp = {}
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations_tmp:
                    log.debug(
                        f"{calculation_operation}.upper() not in {operations_tmp}"
                    )
                    operations_tmp[calculation_operation.upper()] = {}
                    # Reuse the options already configured for this operation, if any
                    add_value_into_dict(
                        dict_tree=operations_tmp,
                        sections=[
                            calculation_operation.upper(),
                        ],
                        value=operations.get(calculation_operation.upper(), {}),
                    )
            # Add operations already in param (after the quick ones)
            for calculation_operation in operations:
                if calculation_operation not in operations_tmp:
                    operations_tmp[calculation_operation] = operations.get(
                        calculation_operation, {}
                    )

            # Update operations in param
            operations = operations_tmp

        # Operations for calculation
        if not operations:
            operations = param.get("calculation", {}).get("calculations", {})

        if operations:
            log.info(f"Calculations...")

        # For each operation: validate against the config, then dispatch by type
        for operation_name in operations:
            operation_name = operation_name.upper()
            if operation_name not in [""]:
                if operation_name in operations_config:
                    log.info(f"Calculation '{operation_name}'")
                    operation = operations_config[operation_name]
                    operation_type = operation.get("type", "sql")
                    if operation_type == "python":
                        self.calculation_process_function(
                            operation=operation, operation_name=operation_name
                        )
                    elif operation_type == "sql":
                        self.calculation_process_sql(
                            operation=operation, operation_name=operation_name
                        )
                    else:
                        log.error(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                else:
                    log.error(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )
                    raise ValueError(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

It takes a list of operations, and for each operation, it checks if it's a python or sql operation, and then calls the appropriate function

param json example: "calculation": { "NOMEN": { "options": { "hgvs_field": "hgvs" } }, "middle": null }

def calculation_process_sql(self, operation: dict, operation_name: str = 'unknown') -> None:
8430    def calculation_process_sql(
8431        self, operation: dict, operation_name: str = "unknown"
8432    ) -> None:
8433        """
8434        The `calculation_process_sql` function takes in a mathematical operation as a string and
8435        performs the operation, updating the specified table with the result.
8436
8437        :param operation: The `operation` parameter is a dictionary that contains information about the
8438        mathematical operation to be performed. It includes the following keys:
8439        :type operation: dict
8440        :param operation_name: The `operation_name` parameter is a string that represents the name of
8441        the mathematical operation being performed. It is used for logging and error handling purposes,
8442        defaults to unknown
8443        :type operation_name: str (optional)
8444        """
8445
8446        # Operation infos
8447        operation_name = operation.get("name", "unknown")
8448        log.debug(f"process SQL {operation_name}")
8449        output_column_name = operation.get("output_column_name", operation_name)
8450        output_column_type = operation.get("output_column_type", "String")
8451        prefix = operation.get("explode_infos_prefix", "")
8452        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
8453        output_column_description = operation.get(
8454            "output_column_description", f"{operation_name} operation"
8455        )
8456        operation_query = operation.get("operation_query", None)
8457        if isinstance(operation_query, list):
8458            operation_query = " ".join(operation_query)
8459        operation_info_fields = operation.get("info_fields", [])
8460        operation_info_fields_check = operation.get("info_fields_check", False)
8461        operation_info = operation.get("operation_info", True)
8462        operation_table = operation.get(
8463            "table", self.get_table_variants(clause="alter")
8464        )
8465
8466        # table variants
8467        if operation_table:
8468            table_variants = operation_table
8469        else:
8470            table_variants = self.get_table_variants(clause="alter")
8471
8472        if operation_query:
8473
8474            # Info fields check
8475            operation_info_fields_check_result = True
8476            if operation_info_fields_check:
8477                header_infos = self.get_header().infos
8478                for info_field in operation_info_fields:
8479                    operation_info_fields_check_result = (
8480                        operation_info_fields_check_result
8481                        and info_field in header_infos
8482                    )
8483
8484            # If info fields available
8485            if operation_info_fields_check_result:
8486
8487                # Added_columns
8488                added_columns = []
8489
8490                # Create VCF header field
8491                vcf_reader = self.get_header()
8492                vcf_reader.infos[output_column_name] = vcf.parser._Info(
8493                    output_column_name,
8494                    ".",
8495                    output_column_type,
8496                    output_column_description,
8497                    "howard calculation",
8498                    "0",
8499                    self.code_type_map.get(output_column_type),
8500                )
8501
8502                # Explode infos if needed
8503                log.debug(f"calculation_process_sql prefix {prefix}")
8504                added_columns += self.explode_infos(
8505                    prefix=prefix,
8506                    fields=[output_column_name] + operation_info_fields,
8507                    force=False,
8508                    table=table_variants,
8509                )
8510
8511                # Create column
8512                added_column = self.add_column(
8513                    table_name=table_variants,
8514                    column_name=prefix + output_column_name,
8515                    column_type=output_column_type_sql,
8516                    default_value="null",
8517                )
8518                added_columns.append(added_column)
8519
8520                # Operation calculation
8521                try:
8522
8523                    # Query to update calculation column
8524                    sql_update = f"""
8525                        UPDATE {table_variants}
8526                        SET "{prefix}{output_column_name}" = ({operation_query})
8527                    """
8528                    self.conn.execute(sql_update)
8529
8530                    # Add to INFO
8531                    if operation_info:
8532                        sql_update_info = f"""
8533                            UPDATE {table_variants}
8534                            SET "INFO" =
8535                                concat(
8536                                    CASE
8537                                        WHEN "INFO" IS NOT NULL
8538                                        THEN concat("INFO", ';')
8539                                        ELSE ''
8540                                    END,
8541                                    '{output_column_name}=',
8542                                    "{prefix}{output_column_name}"
8543                                )
8544                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
8545                        """
8546                        self.conn.execute(sql_update_info)
8547
8548                except:
8549                    log.error(
8550                        f"Operations config: Calculation '{operation_name}' query failed"
8551                    )
8552                    raise ValueError(
8553                        f"Operations config: Calculation '{operation_name}' query failed"
8554                    )
8555
8556                # Remove added columns
8557                for added_column in added_columns:
8558                    log.debug(f"added_column: {added_column}")
8559                    self.drop_column(column=added_column)
8560
8561            else:
8562                log.error(
8563                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
8564                )
8565                raise ValueError(
8566                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
8567                )
8568
8569        else:
8570            log.error(
8571                f"Operations config: Calculation '{operation_name}' query NOT defined"
8572            )
8573            raise ValueError(
8574                f"Operations config: Calculation '{operation_name}' query NOT defined"
8575            )

The calculation_process_sql function takes in a mathematical operation as a string and performs the operation, updating the specified table with the result.

Parameters
  • operation: The operation parameter is a dictionary that contains information about the mathematical operation to be performed. It includes the following keys:
  • operation_name: The operation_name parameter is a string that represents the name of the mathematical operation being performed. It is used for logging and error handling purposes, defaults to unknown
def calculation_process_function(self, operation: dict, operation_name: str = 'unknown') -> None:
8577    def calculation_process_function(
8578        self, operation: dict, operation_name: str = "unknown"
8579    ) -> None:
8580        """
8581        The `calculation_process_function` takes in an operation dictionary and performs the specified
8582        function with the given parameters.
8583
8584        :param operation: The `operation` parameter is a dictionary that contains information about the
8585        operation to be performed. It has the following keys:
8586        :type operation: dict
8587        :param operation_name: The `operation_name` parameter is a string that represents the name of
8588        the operation being performed. It is used for logging purposes, defaults to unknown
8589        :type operation_name: str (optional)
8590        """
8591
8592        operation_name = operation["name"]
8593        log.debug(f"process Python {operation_name}")
8594        function_name = operation["function_name"]
8595        function_params = operation["function_params"]
8596        getattr(self, function_name)(*function_params)

The calculation_process_function takes in an operation dictionary and performs the specified function with the given parameters.

Parameters
  • operation: The operation parameter is a dictionary that contains information about the operation to be performed. It has the following keys:
  • operation_name: The operation_name parameter is a string that represents the name of the operation being performed. It is used for logging purposes, defaults to unknown
def calculation_variant_id(self) -> None:
8598    def calculation_variant_id(self) -> None:
8599        """
8600        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
8601        updates the INFO field of a variants table with the variant ID.
8602        """
8603
8604        # variant_id annotation field
8605        variant_id_tag = self.get_variant_id_column()
8606        added_columns = [variant_id_tag]
8607
8608        # variant_id hgvs tags"
8609        vcf_infos_tags = {
8610            variant_id_tag: "howard variant ID annotation",
8611        }
8612
8613        # Variants table
8614        table_variants = self.get_table_variants()
8615
8616        # Header
8617        vcf_reader = self.get_header()
8618
8619        # Add variant_id to header
8620        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
8621            variant_id_tag,
8622            ".",
8623            "String",
8624            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
8625            "howard calculation",
8626            "0",
8627            self.code_type_map.get("String"),
8628        )
8629
8630        # Update
8631        sql_update = f"""
8632            UPDATE {table_variants}
8633            SET "INFO" = 
8634                concat(
8635                    CASE
8636                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8637                        THEN ''
8638                        ELSE concat("INFO", ';')
8639                    END,
8640                    '{variant_id_tag}=',
8641                    "{variant_id_tag}"
8642                )
8643        """
8644        self.conn.execute(sql_update)
8645
8646        # Remove added columns
8647        for added_column in added_columns:
8648            self.drop_column(column=added_column)

The function calculation_variant_id adds a variant ID annotation to a VCF file header and updates the INFO field of a variants table with the variant ID.

def calculation_extract_snpeff_hgvs( self, snpeff_hgvs: str = 'snpeff_hgvs', snpeff_field: str = 'ANN') -> None:
8650    def calculation_extract_snpeff_hgvs(
8651        self,
8652        snpeff_hgvs: str = "snpeff_hgvs",
8653        snpeff_field: str = "ANN",
8654    ) -> None:
8655        """
8656        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
8657        annotation field in a VCF file and adds them as a new column in the variants table.
8658
8659        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
8660        function is used to specify the name of the column that will store the HGVS nomenclatures
8661        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
8662        snpeff_hgvs
8663        :type snpeff_hgvs: str (optional)
8664        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
8665        function represents the field in the VCF file that contains SnpEff annotations. This field is
8666        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
8667        to ANN
8668        :type snpeff_field: str (optional)
8669        """
8670
8671        # Snpeff hgvs tags
8672        vcf_infos_tags = {
8673            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
8674        }
8675
8676        # Prefix
8677        prefix = self.get_explode_infos_prefix()
8678        if prefix:
8679            prefix = "INFO/"
8680
8681        # snpEff fields
8682        speff_ann_infos = prefix + snpeff_field
8683        speff_hgvs_infos = prefix + snpeff_hgvs
8684
8685        # Variants table
8686        table_variants = self.get_table_variants()
8687
8688        # Header
8689        vcf_reader = self.get_header()
8690
8691        # Add columns
8692        added_columns = []
8693
8694        # Explode HGVS field in column
8695        added_columns += self.explode_infos(fields=[snpeff_field])
8696
8697        if snpeff_field in vcf_reader.infos:
8698
8699            log.debug(vcf_reader.infos[snpeff_field])
8700
8701            # Extract ANN header
8702            ann_description = vcf_reader.infos[snpeff_field].desc
8703            pattern = r"'(.+?)'"
8704            match = re.search(pattern, ann_description)
8705            if match:
8706                ann_header_match = match.group(1).split(" | ")
8707                ann_header_desc = {}
8708                for i in range(len(ann_header_match)):
8709                    ann_header_info = "".join(
8710                        char for char in ann_header_match[i] if char.isalnum()
8711                    )
8712                    ann_header_desc[ann_header_info] = ann_header_match[i]
8713                if not ann_header_desc:
8714                    raise ValueError("Invalid header description format")
8715            else:
8716                raise ValueError("Invalid header description format")
8717
8718            # Create variant id
8719            variant_id_column = self.get_variant_id_column()
8720            added_columns += [variant_id_column]
8721
8722            # Create dataframe
8723            dataframe_snpeff_hgvs = self.get_query_to_df(
8724                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
8725            )
8726
8727            # Create main NOMEN column
8728            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
8729                speff_ann_infos
8730            ].apply(
8731                lambda x: extract_snpeff_hgvs(
8732                    str(x), header=list(ann_header_desc.values())
8733                )
8734            )
8735
8736            # Add snpeff_hgvs to header
8737            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
8738                snpeff_hgvs,
8739                ".",
8740                "String",
8741                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
8742                "howard calculation",
8743                "0",
8744                self.code_type_map.get("String"),
8745            )
8746
8747            # Update
8748            sql_update = f"""
8749                UPDATE variants
8750                SET "INFO" = 
8751                    concat(
8752                        CASE
8753                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8754                            THEN ''
8755                            ELSE concat("INFO", ';')
8756                        END,
8757                        CASE 
8758                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
8759                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
8760                            THEN concat(
8761                                    '{snpeff_hgvs}=',
8762                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
8763                                )
8764                            ELSE ''
8765                        END
8766                    )
8767                FROM dataframe_snpeff_hgvs
8768                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
8769
8770            """
8771            self.conn.execute(sql_update)
8772
8773            # Delete dataframe
8774            del dataframe_snpeff_hgvs
8775            gc.collect()
8776
8777        else:
8778
8779            log.warning(
8780                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
8781            )
8782
8783        # Remove added columns
8784        for added_column in added_columns:
8785            self.drop_column(column=added_column)

The function calculation_extract_snpeff_hgvs extracts HGVS nomenclatures from the SnpEff annotation field in a VCF file and adds them as a new column in the variants table.

Parameters
  • snpeff_hgvs: The snpeff_hgvs parameter in the calculation_extract_snpeff_hgvs function specifies the name of the column that will store the HGVS nomenclatures extracted from the SnpEff annotation field in a VCF file; defaults to snpeff_hgvs
  • snpeff_field: The snpeff_field parameter in the calculation_extract_snpeff_hgvs function names the field in the VCF file that contains SnpEff annotations; HGVS nomenclatures are extracted from this field and added as a new column; defaults to ANN
def calculation_snpeff_ann_explode( self, uniquify: bool = True, output_format: str = 'fields', output_prefix: str = 'snpeff_', snpeff_field: str = 'ANN') -> None:
8787    def calculation_snpeff_ann_explode(
8788        self,
8789        uniquify: bool = True,
8790        output_format: str = "fields",
8791        output_prefix: str = "snpeff_",
8792        snpeff_field: str = "ANN",
8793    ) -> None:
8794        """
8795        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
8796        exploding the HGVS field and updating variant information accordingly.
8797
8798        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
8799        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
8800        it indicates that the output should be unique, meaning that duplicate entries should be removed,
8801        defaults to True
8802        :type uniquify: bool (optional)
8803        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
8804        function specifies the format in which the output annotations will be generated. It has a
8805        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
8806        format, defaults to fields
8807        :type output_format: str (optional)
8808        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
8809        method is used to specify the prefix that will be added to the output annotations generated
8810        during the calculation process. This prefix helps to differentiate the newly added annotations
8811        from existing ones in the output data. By default, the, defaults to ANN_
8812        :type output_prefix: str (optional)
8813        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
8814        function is used to specify the field in the VCF file that contains SnpEff annotations. This
8815        field will be processed to explode the HGVS annotations and update the variant information
8816        accordingly, defaults to ANN
8817        :type snpeff_field: str (optional)
8818        """
8819
8820        # SnpEff annotation field
8821        snpeff_hgvs = "snpeff_ann_explode"
8822
8823        # Snpeff hgvs tags
8824        vcf_infos_tags = {
8825            snpeff_hgvs: "Explode snpEff annotations",
8826        }
8827
8828        # Prefix
8829        prefix = self.get_explode_infos_prefix()
8830        if prefix:
8831            prefix = "INFO/"
8832
8833        # snpEff fields
8834        speff_ann_infos = prefix + snpeff_field
8835        speff_hgvs_infos = prefix + snpeff_hgvs
8836
8837        # Variants table
8838        table_variants = self.get_table_variants()
8839
8840        # Header
8841        vcf_reader = self.get_header()
8842
8843        # Add columns
8844        added_columns = []
8845
8846        # Explode HGVS field in column
8847        added_columns += self.explode_infos(fields=[snpeff_field])
8848        log.debug(f"snpeff_field={snpeff_field}")
8849        log.debug(f"added_columns={added_columns}")
8850
8851        if snpeff_field in vcf_reader.infos:
8852
8853            # Extract ANN header
8854            ann_description = vcf_reader.infos[snpeff_field].desc
8855            pattern = r"'(.+?)'"
8856            match = re.search(pattern, ann_description)
8857            if match:
8858                ann_header_match = match.group(1).split(" | ")
8859                ann_header = []
8860                ann_header_desc = {}
8861                for i in range(len(ann_header_match)):
8862                    ann_header_info = "".join(
8863                        char for char in ann_header_match[i] if char.isalnum()
8864                    )
8865                    ann_header.append(ann_header_info)
8866                    ann_header_desc[ann_header_info] = ann_header_match[i]
8867                if not ann_header_desc:
8868                    raise ValueError("Invalid header description format")
8869            else:
8870                raise ValueError("Invalid header description format")
8871
8872            # Create variant id
8873            variant_id_column = self.get_variant_id_column()
8874            added_columns += [variant_id_column]
8875
8876            # Create dataframe
8877            dataframe_snpeff_hgvs = self.get_query_to_df(
8878                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
8879            )
8880
8881            # Create snpEff columns
8882            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
8883                speff_ann_infos
8884            ].apply(
8885                lambda x: explode_snpeff_ann(
8886                    str(x),
8887                    uniquify=uniquify,
8888                    output_format=output_format,
8889                    prefix=output_prefix,
8890                    header=list(ann_header_desc.values()),
8891                )
8892            )
8893
8894            # Header
8895            ann_annotations_prefix = ""
8896            if output_format.upper() in ["JSON"]:
8897                ann_annotations_prefix = f"{output_prefix}="
8898                vcf_reader.infos[output_prefix] = vcf.parser._Info(
8899                    output_prefix,
8900                    ".",
8901                    "String",
8902                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
8903                    + " - JSON format",
8904                    "howard calculation",
8905                    "0",
8906                    self.code_type_map.get("String"),
8907                )
8908            else:
8909                for ann_annotation in ann_header:
8910                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
8911                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
8912                        ann_annotation_id,
8913                        ".",
8914                        "String",
8915                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
8916                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
8917                        "howard calculation",
8918                        "0",
8919                        self.code_type_map.get("String"),
8920                    )
8921
8922            # Update
8923            sql_update = f"""
8924                UPDATE variants
8925                SET "INFO" = 
8926                    concat(
8927                        CASE
8928                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8929                            THEN ''
8930                            ELSE concat("INFO", ';')
8931                        END,
8932                        CASE 
8933                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
8934                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
8935                            THEN concat(
8936                                '{ann_annotations_prefix}',
8937                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
8938                                )
8939                            ELSE ''
8940                        END
8941                    )
8942                FROM dataframe_snpeff_hgvs
8943                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
8944
8945            """
8946            self.conn.execute(sql_update)
8947
8948            # Delete dataframe
8949            del dataframe_snpeff_hgvs
8950            gc.collect()
8951
8952        else:
8953
8954            log.warning(
8955                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
8956            )
8957
8958        # Remove added columns
8959        for added_column in added_columns:
8960            self.drop_column(column=added_column)

The calculation_snpeff_ann_explode function processes SnpEff annotations in a VCF file by exploding the HGVS field and updating variant information accordingly.

Parameters
  • uniquify: The uniquify parameter in the calculation_snpeff_ann_explode method is a boolean flag that determines whether the output should be uniquified or not. When set to True, it indicates that the output should be unique, meaning that duplicate entries should be removed, defaults to True
  • output_format: The output_format parameter in the calculation_snpeff_ann_explode function specifies the format in which the output annotations will be generated. It has a default value of "fields". You can also set it to "JSON" to output the annotations in JSON format, defaults to fields
  • output_prefix: The output_prefix parameter in the calculation_snpeff_ann_explode method specifies the prefix added to the output annotations generated during the calculation process, to differentiate the newly added annotations from existing ones in the output data; defaults to snpeff_
  • snpeff_field: The snpeff_field parameter in the calculation_snpeff_ann_explode function is used to specify the field in the VCF file that contains SnpEff annotations. This field will be processed to explode the HGVS annotations and update the variant information accordingly, defaults to ANN
def calculation_extract_nomen(self) -> None:
8962    def calculation_extract_nomen(self) -> None:
8963        """
8964        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
8965        """
8966
8967        # NOMEN field
8968        field_nomen_dict = "NOMEN_DICT"
8969
8970        # NOMEN structure
8971        nomen_dict = {
8972            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
8973            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
8974            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
8975            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
8976            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
8977            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
8978            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
8979            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
8980            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
8981            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
8982        }
8983
8984        # Param
8985        param = self.get_param()
8986
8987        # Threads
8988        threads = self.get_threads()
8989
8990        # Prefix
8991        prefix = self.get_explode_infos_prefix()
8992
8993        # Header
8994        vcf_reader = self.get_header()
8995
8996        # Added columns
8997        added_columns = []
8998
8999        # Get HGVS field
9000        hgvs_field = (
9001            param.get("calculation", {})
9002            .get("calculations", {})
9003            .get("NOMEN", {})
9004            .get("options", {})
9005            .get("hgvs_field", "hgvs")
9006        )
9007
9008        # Get NOMEN pattern
9009        nomen_pattern = (
9010            param.get("calculation", {})
9011            .get("calculations", {})
9012            .get("NOMEN", {})
9013            .get("options", {})
9014            .get("pattern", None)
9015        )
9016
9017        # transcripts list of preference sources
9018        transcripts_sources = {}
9019
9020        # Get transcripts
9021        transcripts_file = (
9022            param.get("calculation", {})
9023            .get("calculations", {})
9024            .get("NOMEN", {})
9025            .get("options", {})
9026            .get("transcripts", None)
9027        )
9028        transcripts_file = full_path(transcripts_file)
9029        if transcripts_file:
9030            if os.path.exists(transcripts_file):
9031                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
9032                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
9033                transcripts_sources["file"] = transcripts_from_file
9034            else:
9035                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
9036                log.error(msg_err)
9037                raise ValueError(msg_err)
9038
9039        # Get transcripts table
9040        transcripts_table = (
9041            param.get("calculation", {})
9042            .get("calculations", {})
9043            .get("NOMEN", {})
9044            .get("options", {})
9045            .get("transcripts_table", self.get_table_variants())
9046        )
9047        # Get transcripts column
9048        transcripts_column = (
9049            param.get("calculation", {})
9050            .get("calculations", {})
9051            .get("NOMEN", {})
9052            .get("options", {})
9053            .get("transcripts_column", None)
9054        )
9055
9056        if transcripts_table and transcripts_column:
9057            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
9058            # Explode if not exists
9059            added_columns += self.explode_infos(
9060                fields=[transcripts_column], table=transcripts_table
9061            )
9062        else:
9063            extra_field_transcript = f"NULL"
9064
9065        # Transcripts of preference source order
9066        transcripts_order = (
9067            param.get("calculation", {})
9068            .get("calculations", {})
9069            .get("NOMEN", {})
9070            .get("options", {})
9071            .get("transcripts_order", ["column", "file"])
9072        )
9073
9074        # Transcripts from file
9075        transcripts = transcripts_sources.get("file", [])
9076
9077        # Explode HGVS field in column
9078        added_columns += self.explode_infos(fields=[hgvs_field])
9079
9080        # extra infos
9081        extra_infos = self.get_extra_infos()
9082        extra_field = prefix + hgvs_field
9083
9084        if extra_field in extra_infos:
9085
9086            # Create dataframe
9087            dataframe_hgvs = self.get_query_to_df(
9088                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
9089            )
9090
9091            # Transcripts rank
9092            transcripts_rank = {
9093                transcript: rank for rank, transcript in enumerate(transcripts, start=1)
9094            }
9095            transcripts_len = len(transcripts_rank)
9096
9097            # Create main NOMEN column
9098            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
9099                lambda x: find_nomen(
9100                    hgvs=x.hgvs,
9101                    transcript=x.transcript,
9102                    transcripts=transcripts_rank,
9103                    pattern=nomen_pattern,
9104                    transcripts_source_order=transcripts_order,
9105                    transcripts_len=transcripts_len,
9106                ),
9107                axis=1,
9108            )
9109
9110            # Explode NOMEN Structure and create SQL set for update
9111            sql_nomen_fields = []
9112            for nomen_field in nomen_dict:
9113
9114                # Create VCF header field
9115                vcf_reader.infos[nomen_field] = vcf.parser._Info(
9116                    nomen_field,
9117                    ".",
9118                    "String",
9119                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
9120                    "howard calculation",
9121                    "0",
9122                    self.code_type_map.get("String"),
9123                )
9124
9125                # Add field to SQL query update
9126                sql_nomen_fields.append(
9127                    f"""
9128                        CASE 
9129                            WHEN dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT NULL AND dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT IN ('')
9130                            THEN concat(
9131                                    ';{nomen_field}=',
9132                                    dataframe_hgvs."{field_nomen_dict}"."{nomen_field}"
9133                                )
9134                            ELSE ''
9135                        END
9136                    """
9137                )
9138
9139            # SQL set for update
9140            sql_nomen_fields_set = ", ".join(sql_nomen_fields)
9141
9142            # Update
9143            sql_update = f"""
9144                UPDATE variants
9145                SET "INFO" = 
9146                    concat(
9147                        CASE
9148                            WHEN "INFO" IS NULL
9149                            THEN ''
9150                            ELSE "INFO"
9151                        END,
9152                        {sql_nomen_fields_set}
9153                    )
9154                FROM dataframe_hgvs
9155                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
9156                    AND variants."POS" = dataframe_hgvs."POS" 
9157                    AND variants."REF" = dataframe_hgvs."REF"
9158                    AND variants."ALT" = dataframe_hgvs."ALT"
9159            """
9160            self.conn.execute(sql_update)
9161
9162            # Delete dataframe
9163            del dataframe_hgvs
9164            gc.collect()
9165
9166        # Remove added columns
9167        for added_column in added_columns:
9168            self.drop_column(column=added_column)

This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

def calculation_find_by_pipeline(self, tag: str = 'findbypipeline') -> None:
9170    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
9171        """
9172        The function `calculation_find_by_pipeline` performs a calculation to find the number of
9173        pipeline/sample for a variant and updates the variant information in a VCF file.
9174
9175        :param tag: The `tag` parameter is a string that represents the annotation field for the
9176        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
9177        VCF header and to update the corresponding field in the variants table, defaults to
9178        findbypipeline
9179        :type tag: str (optional)
9180        """
9181
9182        # if FORMAT and samples
9183        if (
9184            "FORMAT" in self.get_header_columns_as_list()
9185            and self.get_header_sample_list()
9186        ):
9187
9188            # findbypipeline annotation field
9189            findbypipeline_tag = tag
9190
9191            # VCF infos tags
9192            vcf_infos_tags = {
9193                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
9194            }
9195
9196            # Prefix
9197            prefix = self.get_explode_infos_prefix()
9198
9199            # Field
9200            findbypipeline_infos = prefix + findbypipeline_tag
9201
9202            # Variants table
9203            table_variants = self.get_table_variants()
9204
9205            # Header
9206            vcf_reader = self.get_header()
9207
9208            # Create variant id
9209            variant_id_column = self.get_variant_id_column()
9210            added_columns = [variant_id_column]
9211
9212            # variant_id, FORMAT and samples
9213            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9214                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
9215            )
9216
9217            # Create dataframe
9218            dataframe_findbypipeline = self.get_query_to_df(
9219                f""" SELECT {samples_fields} FROM {table_variants} """
9220            )
9221
9222            # Create findbypipeline column
9223            dataframe_findbypipeline[findbypipeline_infos] = (
9224                dataframe_findbypipeline.apply(
9225                    lambda row: findbypipeline(
9226                        row, samples=self.get_header_sample_list()
9227                    ),
9228                    axis=1,
9229                )
9230            )
9231
9232            # Add snpeff_hgvs to header
9233            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
9234                findbypipeline_tag,
9235                ".",
9236                "String",
9237                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
9238                "howard calculation",
9239                "0",
9240                self.code_type_map.get("String"),
9241            )
9242
9243            # Update
9244            sql_update = f"""
9245                UPDATE variants
9246                SET "INFO" = 
9247                    concat(
9248                        CASE
9249                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9250                            THEN ''
9251                            ELSE concat("INFO", ';')
9252                        END,
9253                        CASE 
9254                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
9255                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
9256                            THEN concat(
9257                                    '{findbypipeline_tag}=',
9258                                    dataframe_findbypipeline."{findbypipeline_infos}"
9259                                )
9260                            ELSE ''
9261                        END
9262                    )
9263                FROM dataframe_findbypipeline
9264                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
9265            """
9266            self.conn.execute(sql_update)
9267
9268            # Remove added columns
9269            for added_column in added_columns:
9270                self.drop_column(column=added_column)
9271
9272            # Delete dataframe
9273            del dataframe_findbypipeline
9274            gc.collect()

The function calculation_find_by_pipeline performs a calculation to find the number of pipeline/sample for a variant and updates the variant information in a VCF file.

Parameters
  • tag: The tag parameter is a string that represents the annotation field for the "findbypipeline" information in the VCF file. It is used to create the annotation field in the VCF header and to update the corresponding field in the variants table, defaults to findbypipeline
def calculation_genotype_concordance(self) -> None:
9276    def calculation_genotype_concordance(self) -> None:
9277        """
9278        The function `calculation_genotype_concordance` calculates the genotype concordance for
9279        multi-caller VCF files and updates the variant information in the database.
9280        """
9281
9282        # if FORMAT and samples
9283        if (
9284            "FORMAT" in self.get_header_columns_as_list()
9285            and self.get_header_sample_list()
9286        ):
9287
9288            # genotypeconcordance annotation field
9289            genotypeconcordance_tag = "genotypeconcordance"
9290
9291            # VCF infos tags
9292            vcf_infos_tags = {
9293                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
9294            }
9295
9296            # Prefix
9297            prefix = self.get_explode_infos_prefix()
9298
9299            # Field
9300            genotypeconcordance_infos = prefix + genotypeconcordance_tag
9301
9302            # Variants table
9303            table_variants = self.get_table_variants()
9304
9305            # Header
9306            vcf_reader = self.get_header()
9307
9308            # Create variant id
9309            variant_id_column = self.get_variant_id_column()
9310            added_columns = [variant_id_column]
9311
9312            # variant_id, FORMAT and samples
9313            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9314                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
9315            )
9316
9317            # Create dataframe
9318            dataframe_genotypeconcordance = self.get_query_to_df(
9319                f""" SELECT {samples_fields} FROM {table_variants} """
9320            )
9321
9322            # Create genotypeconcordance column
9323            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
9324                dataframe_genotypeconcordance.apply(
9325                    lambda row: genotypeconcordance(
9326                        row, samples=self.get_header_sample_list()
9327                    ),
9328                    axis=1,
9329                )
9330            )
9331
9332            # Add genotypeconcordance to header
9333            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
9334                genotypeconcordance_tag,
9335                ".",
9336                "String",
9337                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
9338                "howard calculation",
9339                "0",
9340                self.code_type_map.get("String"),
9341            )
9342
9343            # Update
9344            sql_update = f"""
9345                UPDATE variants
9346                SET "INFO" = 
9347                    concat(
9348                        CASE
9349                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9350                            THEN ''
9351                            ELSE concat("INFO", ';')
9352                        END,
9353                        CASE
9354                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
9355                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
9356                            THEN concat(
9357                                    '{genotypeconcordance_tag}=',
9358                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
9359                                )
9360                            ELSE ''
9361                        END
9362                    )
9363                FROM dataframe_genotypeconcordance
9364                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
9365            """
9366            self.conn.execute(sql_update)
9367
9368            # Remove added columns
9369            for added_column in added_columns:
9370                self.drop_column(column=added_column)
9371
9372            # Delete dataframe
9373            del dataframe_genotypeconcordance
9374            gc.collect()

The function calculation_genotype_concordance calculates the genotype concordance for multi-caller VCF files and updates the variant information in the database.

def calculation_barcode(self, tag: str = 'barcode') -> None:
9376    def calculation_barcode(self, tag: str = "barcode") -> None:
9377        """
9378        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
9379        updates the INFO field in the file with the calculated barcode values.
9380
9381        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
9382        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
9383        the default tag name is set to "barcode", defaults to barcode
9384        :type tag: str (optional)
9385        """
9386
9387        # if FORMAT and samples
9388        if (
9389            "FORMAT" in self.get_header_columns_as_list()
9390            and self.get_header_sample_list()
9391        ):
9392
9393            # barcode annotation field
9394            if not tag:
9395                tag = "barcode"
9396
9397            # VCF infos tags
9398            vcf_infos_tags = {
9399                tag: "barcode calculation (VaRank)",
9400            }
9401
9402            # Prefix
9403            prefix = self.get_explode_infos_prefix()
9404
9405            # Field
9406            barcode_infos = prefix + tag
9407
9408            # Variants table
9409            table_variants = self.get_table_variants()
9410
9411            # Header
9412            vcf_reader = self.get_header()
9413
9414            # Create variant id
9415            variant_id_column = self.get_variant_id_column()
9416            added_columns = [variant_id_column]
9417
9418            # variant_id, FORMAT and samples
9419            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9420                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
9421            )
9422
9423            # Create dataframe
9424            dataframe_barcode = self.get_query_to_df(
9425                f""" SELECT {samples_fields} FROM {table_variants} """
9426            )
9427
9428            # Create barcode column
9429            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
9430                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
9431            )
9432
9433            # Add barcode to header
9434            vcf_reader.infos[tag] = vcf.parser._Info(
9435                tag,
9436                ".",
9437                "String",
9438                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
9439                "howard calculation",
9440                "0",
9441                self.code_type_map.get("String"),
9442            )
9443
9444            # Update
9445            sql_update = f"""
9446                UPDATE {table_variants}
9447                SET "INFO" = 
9448                    concat(
9449                        CASE
9450                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9451                            THEN ''
9452                            ELSE concat("INFO", ';')
9453                        END,
9454                        CASE
9455                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
9456                            AND dataframe_barcode."{barcode_infos}" NOT NULL
9457                            THEN concat(
9458                                    '{tag}=',
9459                                    dataframe_barcode."{barcode_infos}"
9460                                )
9461                            ELSE ''
9462                        END
9463                    )
9464                FROM dataframe_barcode
9465                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
9466            """
9467            self.conn.execute(sql_update)
9468
9469            # Remove added columns
9470            for added_column in added_columns:
9471                self.drop_column(column=added_column)
9472
9473            # Delete dataframe
9474            del dataframe_barcode
9475            gc.collect()

The calculation_barcode function calculates barcode values for variants in a VCF file and updates the INFO field in the file with the calculated barcode values.

Parameters
  • tag: The tag parameter in the calculation_barcode function is used to specify the tag name that will be used for the barcode calculation in the VCF file. If no tag name is provided, the default tag name is set to "barcode", defaults to barcode
def calculation_barcode_family(self, tag: str = 'BCF') -> None:
9477    def calculation_barcode_family(self, tag: str = "BCF") -> None:
9478        """
9479        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
9480        and updates the INFO field in the file with the calculated barcode values.
9481
9482        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
9483        the barcode tag that will be added to the VCF file during the calculation process. If no value
9484        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
9485        :type tag: str (optional)
9486        """
9487
9488        # if FORMAT and samples
9489        if (
9490            "FORMAT" in self.get_header_columns_as_list()
9491            and self.get_header_sample_list()
9492        ):
9493
9494            # barcode annotation field
9495            if not tag:
9496                tag = "BCF"
9497
9498            # VCF infos tags
9499            vcf_infos_tags = {
9500                tag: "barcode family calculation",
9501                f"{tag}S": "barcode family samples",
9502            }
9503
9504            # Param
9505            param = self.get_param()
9506            log.debug(f"param={param}")
9507
9508            # Prefix
9509            prefix = self.get_explode_infos_prefix()
9510
9511            # PED param
9512            ped = (
9513                param.get("calculation", {})
9514                .get("calculations", {})
9515                .get("BARCODEFAMILY", {})
9516                .get("family_pedigree", None)
9517            )
9518            log.debug(f"ped={ped}")
9519
9520            # Load PED
9521            if ped:
9522
9523                # Pedigree is a file
9524                if isinstance(ped, str) and os.path.exists(full_path(ped)):
9525                    log.debug("Pedigree is file")
9526                    with open(full_path(ped)) as ped:
9527                        ped = yaml.safe_load(ped)
9528
9529                # Pedigree is a string
9530                elif isinstance(ped, str):
9531                    log.debug("Pedigree is str")
9532                    try:
9533                        ped = json.loads(ped)
9534                        log.debug("Pedigree is json str")
9535                    except ValueError as e:
9536                        ped_samples = ped.split(",")
9537                        ped = {}
9538                        for ped_sample in ped_samples:
9539                            ped[ped_sample] = ped_sample
9540
9541                # Pedigree is a dict
9542                elif isinstance(ped, dict):
9543                    log.debug("Pedigree is dict")
9544
9545                # Pedigree is not well formatted
9546                else:
9547                    msg_error = "Pedigree not well formatted"
9548                    log.error(msg_error)
9549                    raise ValueError(msg_error)
9550
9551                # Construct list
9552                ped_samples = list(ped.values())
9553
9554            else:
9555                log.debug("Pedigree not defined. Take all samples")
9556                ped_samples = self.get_header_sample_list()
9557                ped = {}
9558                for ped_sample in ped_samples:
9559                    ped[ped_sample] = ped_sample
9560
9561            # Check pedigree
9562            if not ped or len(ped) == 0:
9563                msg_error = f"Error in pedigree: samples {ped_samples}"
9564                log.error(msg_error)
9565                raise ValueError(msg_error)
9566
9567            # Log
9568            log.info(
9569                "Calculation 'BARCODEFAMILY' - Samples: "
9570                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
9571            )
9572            log.debug(f"ped_samples={ped_samples}")
9573
9574            # Field
9575            barcode_infos = prefix + tag
9576
9577            # Variants table
9578            table_variants = self.get_table_variants()
9579
9580            # Header
9581            vcf_reader = self.get_header()
9582
9583            # Create variant id
9584            variant_id_column = self.get_variant_id_column()
9585            added_columns = [variant_id_column]
9586
9587            # variant_id, FORMAT and samples
9588            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9589                [f""" "{sample}" """ for sample in ped_samples]
9590            )
9591
9592            # Create dataframe
9593            dataframe_barcode = self.get_query_to_df(
9594                f""" SELECT {samples_fields} FROM {table_variants} """
9595            )
9596
9597            # Create barcode column
9598            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
9599                lambda row: barcode(row, samples=ped_samples), axis=1
9600            )
9601
9602            # Add barcode family to header
9603            # Add vaf_normalization to header
9604            vcf_reader.formats[tag] = vcf.parser._Format(
9605                id=tag,
9606                num=".",
9607                type="String",
9608                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
9609                type_code=self.code_type_map.get("String"),
9610            )
9611            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
9612                id=f"{tag}S",
9613                num=".",
9614                type="String",
9615                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
9616                type_code=self.code_type_map.get("String"),
9617            )
9618
9619            # Update
9620            # for sample in ped_samples:
9621            sql_update_set = []
9622            for sample in self.get_header_sample_list() + ["FORMAT"]:
9623                if sample in ped_samples:
9624                    value = f'dataframe_barcode."{barcode_infos}"'
9625                    value_samples = (
9626                        "'"
9627                        + ",".join([f""" "{sample}" """ for sample in ped_samples])
9628                        + "'"
9629                    )
9630                    ped_samples
9631                elif sample == "FORMAT":
9632                    value = f"'{tag}'"
9633                    value_samples = f"'{tag}S'"
9634                else:
9635                    value = "'.'"
9636                    value_samples = "'.'"
9637                format_regex = r"[a-zA-Z0-9\s]"
9638                sql_update_set.append(
9639                    f"""
9640                        "{sample}" = 
9641                        concat(
9642                            CASE
9643                                WHEN {table_variants}."{sample}" = './.'
9644                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
9645                                ELSE {table_variants}."{sample}"
9646                            END,
9647                            ':',
9648                            {value},
9649                            ':',
9650                            {value_samples}
9651                        )
9652                    """
9653                )
9654
9655            sql_update_set_join = ", ".join(sql_update_set)
9656            sql_update = f"""
9657                UPDATE {table_variants}
9658                SET {sql_update_set_join}
9659                FROM dataframe_barcode
9660                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
9661            """
9662            self.conn.execute(sql_update)
9663
9664            # Remove added columns
9665            for added_column in added_columns:
9666                self.drop_column(column=added_column)
9667
9668            # Delete dataframe
9669            del dataframe_barcode
9670            gc.collect()

The calculation_barcode_family function calculates barcode values for variants in a VCF file and updates the INFO field in the file with the calculated barcode values.

Parameters
  • tag: The tag parameter in the calculation_barcode_family function is used to specify the barcode tag that will be added to the VCF file during the calculation process. If no value is provided for the tag parameter, the default value used is "BCF", defaults to BCF
def calculation_trio(self) -> None:
9672    def calculation_trio(self) -> None:
9673        """
9674        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
9675        information to the INFO field of each variant.
9676        """
9677
9678        # if FORMAT and samples
9679        if (
9680            "FORMAT" in self.get_header_columns_as_list()
9681            and self.get_header_sample_list()
9682        ):
9683
9684            # trio annotation field
9685            trio_tag = "trio"
9686
9687            # VCF infos tags
9688            vcf_infos_tags = {
9689                "trio": "trio calculation",
9690            }
9691
9692            # Param
9693            param = self.get_param()
9694
9695            # Prefix
9696            prefix = self.get_explode_infos_prefix()
9697
9698            # Trio param
9699            trio_ped = (
9700                param.get("calculation", {})
9701                .get("calculations", {})
9702                .get("TRIO", {})
9703                .get("trio_pedigree", None)
9704            )
9705
9706            # Load trio
9707            if trio_ped:
9708
9709                # Trio pedigree is a file
9710                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
9711                    log.debug("TRIO pedigree is file")
9712                    with open(full_path(trio_ped)) as trio_ped:
9713                        trio_ped = yaml.safe_load(trio_ped)
9714
9715                # Trio pedigree is a string
9716                elif isinstance(trio_ped, str):
9717                    log.debug("TRIO pedigree is str")
9718                    try:
9719                        trio_ped = json.loads(trio_ped)
9720                        log.debug("TRIO pedigree is json str")
9721                    except ValueError as e:
9722                        trio_samples = trio_ped.split(",")
9723                        if len(trio_samples) == 3:
9724                            trio_ped = {
9725                                "father": trio_samples[0],
9726                                "mother": trio_samples[1],
9727                                "child": trio_samples[2],
9728                            }
9729                            log.debug("TRIO pedigree is list str")
9730                        else:
9731                            msg_error = "TRIO pedigree not well formatted"
9732                            log.error(msg_error)
9733                            raise ValueError(msg_error)
9734
9735                # Trio pedigree is a dict
9736                elif isinstance(trio_ped, dict):
9737                    log.debug("TRIO pedigree is dict")
9738
9739                # Trio pedigree is not well formatted
9740                else:
9741                    msg_error = "TRIO pedigree not well formatted"
9742                    log.error(msg_error)
9743                    raise ValueError(msg_error)
9744
9745                # Construct trio list
9746                trio_samples = [
9747                    trio_ped.get("father", ""),
9748                    trio_ped.get("mother", ""),
9749                    trio_ped.get("child", ""),
9750                ]
9751
9752            else:
9753                log.debug("TRIO pedigree not defined. Take the first 3 samples")
9754                samples_list = self.get_header_sample_list()
9755                if len(samples_list) >= 3:
9756                    trio_samples = self.get_header_sample_list()[0:3]
9757                    trio_ped = {
9758                        "father": trio_samples[0],
9759                        "mother": trio_samples[1],
9760                        "child": trio_samples[2],
9761                    }
9762                else:
9763                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
9764                    log.error(msg_error)
9765                    raise ValueError(msg_error)
9766
9767            # Check trio pedigree
9768            if not trio_ped or len(trio_ped) != 3:
9769                msg_error = f"Error in TRIO pedigree: {trio_ped}"
9770                log.error(msg_error)
9771                raise ValueError(msg_error)
9772
9773            # Log
9774            log.info(
9775                f"Calculation 'TRIO' - Samples: "
9776                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
9777            )
9778
9779            # Field
9780            trio_infos = prefix + trio_tag
9781
9782            # Variants table
9783            table_variants = self.get_table_variants()
9784
9785            # Header
9786            vcf_reader = self.get_header()
9787
9788            # Create variant id
9789            variant_id_column = self.get_variant_id_column()
9790            added_columns = [variant_id_column]
9791
9792            # variant_id, FORMAT and samples
9793            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9794                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
9795            )
9796
9797            # Create dataframe
9798            dataframe_trio = self.get_query_to_df(
9799                f""" SELECT {samples_fields} FROM {table_variants} """
9800            )
9801
9802            # Create trio column
9803            dataframe_trio[trio_infos] = dataframe_trio.apply(
9804                lambda row: trio(row, samples=trio_samples), axis=1
9805            )
9806
9807            # Add trio to header
9808            vcf_reader.infos[trio_tag] = vcf.parser._Info(
9809                trio_tag,
9810                ".",
9811                "String",
9812                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
9813                "howard calculation",
9814                "0",
9815                self.code_type_map.get("String"),
9816            )
9817
9818            # Update
9819            sql_update = f"""
9820                UPDATE {table_variants}
9821                SET "INFO" = 
9822                    concat(
9823                        CASE
9824                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9825                            THEN ''
9826                            ELSE concat("INFO", ';')
9827                        END,
9828                        CASE
9829                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
9830                             AND dataframe_trio."{trio_infos}" NOT NULL
9831                            THEN concat(
9832                                    '{trio_tag}=',
9833                                    dataframe_trio."{trio_infos}"
9834                                )
9835                            ELSE ''
9836                        END
9837                    )
9838                FROM dataframe_trio
9839                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
9840            """
9841            self.conn.execute(sql_update)
9842
9843            # Remove added columns
9844            for added_column in added_columns:
9845                self.drop_column(column=added_column)
9846
9847            # Delete dataframe
9848            del dataframe_trio
9849            gc.collect()

The calculation_trio function performs trio calculations on a VCF file by adding trio information to the INFO field of each variant.

def calculation_vaf_normalization(self) -> None:
9851    def calculation_vaf_normalization(self) -> None:
9852        """
9853        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
9854        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
9855        :return: The function does not return anything.
9856        """
9857
9858        # if FORMAT and samples
9859        if (
9860            "FORMAT" in self.get_header_columns_as_list()
9861            and self.get_header_sample_list()
9862        ):
9863
9864            # vaf_normalization annotation field
9865            vaf_normalization_tag = "VAF"
9866
9867            # VCF infos tags
9868            vcf_infos_tags = {
9869                "VAF": "VAF Variant Frequency",
9870            }
9871
9872            # Prefix
9873            prefix = self.get_explode_infos_prefix()
9874
9875            # Variants table
9876            table_variants = self.get_table_variants()
9877
9878            # Header
9879            vcf_reader = self.get_header()
9880
9881            # Do not calculate if VAF already exists
9882            if "VAF" in vcf_reader.formats:
9883                log.debug("VAF already on genotypes")
9884                return
9885
9886            # Create variant id
9887            variant_id_column = self.get_variant_id_column()
9888            added_columns = [variant_id_column]
9889
9890            # variant_id, FORMAT and samples
9891            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9892                f""" "{sample}" """ for sample in self.get_header_sample_list()
9893            )
9894
9895            # Create dataframe
9896            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
9897            log.debug(f"query={query}")
9898            dataframe_vaf_normalization = self.get_query_to_df(query=query)
9899
9900            vaf_normalization_set = []
9901
9902            # for each sample vaf_normalization
9903            for sample in self.get_header_sample_list():
9904                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
9905                    lambda row: vaf_normalization(row, sample=sample), axis=1
9906                )
9907                vaf_normalization_set.append(
9908                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
9909                )
9910
9911            # Add VAF to FORMAT
9912            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
9913                "FORMAT"
9914            ].apply(lambda x: str(x) + ":VAF")
9915            vaf_normalization_set.append(
9916                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
9917            )
9918
9919            # Add vaf_normalization to header
9920            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
9921                id=vaf_normalization_tag,
9922                num="1",
9923                type="Float",
9924                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
9925                type_code=self.code_type_map.get("Float"),
9926            )
9927
9928            # Create fields to add in INFO
9929            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
9930
9931            # Update
9932            sql_update = f"""
9933                UPDATE {table_variants}
9934                SET {sql_vaf_normalization_set}
9935                FROM dataframe_vaf_normalization
9936                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
9937
9938            """
9939            self.conn.execute(sql_update)
9940
9941            # Remove added columns
9942            for added_column in added_columns:
9943                self.drop_column(column=added_column)
9944
9945            # Delete dataframe
9946            del dataframe_vaf_normalization
9947            gc.collect()

The calculation_vaf_normalization function calculates the VAF (Variant Allele Frequency) normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.

Returns

The function does not return anything.

def calculation_genotype_stats(self, info: str = 'VAF') -> None:
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given
        information field in a VCF file and updates the INFO column of the variants table
        with the calculated statistics.

        For a field named `info`, seven INFO tags are produced and declared in the header:
        `<info>_stats_nb`, `<info>_stats_list`, `<info>_stats_min`, `<info>_stats_max`,
        `<info>_stats_mean`, `<info>_stats_mediane` and `<info>_stats_stdev`.
        Only runs when the VCF has a FORMAT column and at least one sample.

        :param info: The `info` parameter is a string that represents the type of information
        for which genotype statistics are calculated (e.g. "VAF", "DP"). It is used to
        generate the VCF info tags for the statistics, defaults to VAF
        :type info: str (optional)
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags (one per statistic; iteration order drives the INFO layout)
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field holding the stats dict in the dataframe
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (temporary column, removed at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Create dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create vaf_stats column — presumably genotype_stats returns a dict of
            # per-stat values keyed by the vcf_infos_tags names; verify against helper
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # List of vcf tags
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract this statistic into its own dataframe column
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Declare this statistic's INFO tag in the VCF header
                # (comment previously said "snpeff_hgvs" — copy-paste from another calculation)
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # ';' separator before every tag except the first
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                # NOTE(review): relies on DuckDB accepting 'expr NOT NULL' as
                # 'expr IS NOT NULL' — confirm
                sql_vaf_stats_fields.append(
                    f"""
                        CASE
                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
                            THEN concat(
                                    '{sep}{stat}=',
                                    dataframe_vaf_stats."{stat}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)

            # Update INFO from the dataframe (registered by name in the DuckDB connection)
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_stats
            gc.collect()

The calculation_genotype_stats function calculates genotype statistics for a given information field in a VCF file and updates the INFO column of the variants table with the calculated statistics.

Parameters
  • info: A string naming the information field for which genotype statistics are calculated. It is used to generate the VCF info tags for the statistics (number of occurrences, list of values, minimum, maximum, mean, median, standard deviation). Defaults to "VAF".
def calculation_transcripts_annotation(self, info_json: str = None, info_format: str = None) -> None:
10087    def calculation_transcripts_annotation(
10088        self, info_json: str = None, info_format: str = None
10089    ) -> None:
10090        """
10091        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
10092        field to it if transcripts are available.
10093
10094        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
10095        is a string parameter that represents the information field to be used in the transcripts JSON.
10096        It is used to specify the JSON format for the transcripts information. If no value is provided
10097        when calling the method, it defaults to "
10098        :type info_json: str
10099        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
10100        method is a string parameter that specifies the format of the information field to be used in
10101        the transcripts JSON. It is used to define the format of the information field
10102        :type info_format: str
10103        """
10104
10105        # Create transcripts table
10106        transcripts_table = self.create_transcript_view()
10107
10108        # Add info field
10109        if transcripts_table:
10110            self.transcript_view_to_variants(
10111                transcripts_table=transcripts_table,
10112                transcripts_info_field_json=info_json,
10113                transcripts_info_field_format=info_format,
10114            )
10115        else:
10116            log.info("No Transcripts to process. Check param.json file configuration")

The calculation_transcripts_annotation function creates a transcripts table and adds an info field to it if transcripts are available.

Parameters
  • info_json: A string naming the INFO field that receives the transcripts information serialized as JSON. If no value is provided when calling the method, it defaults to None.
  • info_format: The info_format parameter in the calculation_transcripts_annotation method is a string parameter that specifies the format of the information field to be used in the transcripts JSON. It is used to define the format of the information field
def calculation_transcripts_prioritization(self) -> None:
10118    def calculation_transcripts_prioritization(self) -> None:
10119        """
10120        The function `calculation_transcripts_prioritization` creates a transcripts table and
10121        prioritizes transcripts based on certain criteria.
10122        """
10123
10124        # Create transcripts table
10125        transcripts_table = self.create_transcript_view()
10126
10127        # Add info field
10128        if transcripts_table:
10129            self.transcripts_prioritization(transcripts_table=transcripts_table)
10130        else:
10131            log.info("No Transcripts to process. Check param.json file configuration")

The function calculation_transcripts_prioritization creates a transcripts table and prioritizes transcripts based on certain criteria.

def calculation_transcripts_export(self) -> None:
10133    def calculation_transcripts_export(self) -> None:
10134        """ """
10135
10136        # Create transcripts table
10137        transcripts_table = self.create_transcript_view()
10138
10139        # Add info field
10140        if transcripts_table:
10141            self.transcripts_export(transcripts_table=transcripts_table)
10142        else:
10143            log.info("No Transcripts to process. Check param.json file configuration")
def transcripts_export(self, transcripts_table: str = None, param: dict = {}) -> bool:
10149    def transcripts_export(
10150        self, transcripts_table: str = None, param: dict = {}
10151    ) -> bool:
10152        """ """
10153
10154        log.debug("Start transcripts export...")
10155
10156        # Param
10157        if not param:
10158            param = self.get_param()
10159
10160        # Param export
10161        param_transcript_export = param.get("transcripts", {}).get("export", {})
10162
10163        # Output file
10164        transcripts_export_output = param_transcript_export.get("output", None)
10165
10166        if not param_transcript_export or not transcripts_export_output:
10167            log.warning(f"No transcriipts export parameters defined!")
10168            return False
10169
10170        # List of transcripts annotations
10171        query_describe = f"""
10172            SELECT column_name
10173            FROM (
10174                    DESCRIBE SELECT * FROM {transcripts_table}
10175                )
10176            WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO')
10177        """
10178        transcripts_annotations_list = list(
10179            self.get_query_to_df(query=query_describe)["column_name"]
10180        )
10181
10182        # Create transcripts table for export
10183        transcripts_table_export = f"{transcripts_table}_export_" + "".join(
10184            random.choices(string.ascii_uppercase + string.digits, k=10)
10185        )
10186        query_create_transcripts_table_export = f"""
10187            CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table})
10188        """
10189        self.execute_query(query=query_create_transcripts_table_export)
10190
10191        # Output file format
10192        transcripts_export_output_format = get_file_format(
10193            filename=transcripts_export_output
10194        )
10195
10196        # Format VCF - construct INFO
10197        if transcripts_export_output_format in ["vcf"]:
10198
10199            # Construct query update INFO and header
10200            query_update_info = []
10201            for field in transcripts_annotations_list:
10202
10203                # If field not in header
10204                if field not in self.get_header_infos_list():
10205
10206                    # Add PZ Transcript in header
10207                    self.get_header().infos[field] = vcf.parser._Info(
10208                        field,
10209                        ".",
10210                        "String",
10211                        f"Annotation '{field}' from transcript view",
10212                        "unknown",
10213                        "unknown",
10214                        0,
10215                    )
10216
10217                # Add field as INFO/tag
10218                query_update_info.append(
10219                    f"""
10220                        CASE
10221                            WHEN "{field}" IS NOT NULL
10222                            THEN concat('{field}=', "{field}", ';')    
10223                            ELSE ''     
10224                        END
10225                        """
10226                )
10227
10228            # Query param
10229            query_update_info_value = (
10230                f""" concat('',  {", ".join(query_update_info)}) """
10231            )
10232            query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """
10233
10234        else:
10235
10236            # Query param
10237            query_update_info_value = f""" NULL """
10238            query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """
10239
10240        # Update query INFO column
10241        query_update = f"""
10242            UPDATE {transcripts_table_export}
10243            SET INFO = {query_update_info_value}
10244
10245        """
10246        self.execute_query(query=query_update)
10247
10248        # Export
10249        self.export_output(
10250            output_file=transcripts_export_output,
10251            query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """,
10252        )
10253
10254        # Drop transcripts export table
10255        query_drop_transcripts_table_export = f"""
10256            DROP TABLE {transcripts_table_export}
10257        """
10258        self.execute_query(query=query_drop_transcripts_table_export)
def transcripts_prioritization(self, transcripts_table: str = None, param: dict = {}) -> bool:
10260    def transcripts_prioritization(
10261        self, transcripts_table: str = None, param: dict = {}
10262    ) -> bool:
10263        """
10264        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
10265        and updates the variants table with the prioritized information.
10266
10267        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10268        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
10269        This parameter is used to identify the table where the transcripts data is stored for the
10270        prioritization process
10271        :type transcripts_table: str
10272        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
10273        that contains various configuration settings for the prioritization process of transcripts. It
10274        is used to customize the behavior of the prioritization algorithm and includes settings such as
10275        the prefix for prioritization fields, default profiles, and other
10276        :type param: dict
10277        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
10278        transcripts prioritization process is successfully completed, and `False` if there are any
10279        issues or if no profile is defined for transcripts prioritization.
10280        """
10281
10282        log.debug("Start transcripts prioritization...")
10283
10284        # Param
10285        if not param:
10286            param = self.get_param()
10287
10288        # Variants table
10289        table_variants = self.get_table_variants()
10290
10291        # Transcripts table
10292        if transcripts_table is None:
10293            transcripts_table = self.create_transcript_view(
10294                transcripts_table="transcripts", param=param
10295            )
10296        if transcripts_table is None:
10297            msg_err = "No Transcripts table availalble"
10298            log.error(msg_err)
10299            raise ValueError(msg_err)
10300        log.debug(f"transcripts_table={transcripts_table}")
10301
10302        # Get transcripts columns
10303        columns_as_list_query = f"""
10304            DESCRIBE {transcripts_table}
10305        """
10306        columns_as_list = list(
10307            self.get_query_to_df(columns_as_list_query)["column_name"]
10308        )
10309
10310        # Create INFO if not exists
10311        if "INFO" not in columns_as_list:
10312            query_add_info = f"""
10313                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
10314            """
10315            self.execute_query(query_add_info)
10316
10317        # Prioritization param and Force only PZ Score and Flag
10318        pz_param = param.get("transcripts", {}).get("prioritization", {})
10319
10320        # PZ profile by default
10321        pz_profile_default = (
10322            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
10323        )
10324
10325        # Exit if no profile
10326        if pz_profile_default is None:
10327            log.warning("No profile defined for transcripts prioritization")
10328            return False
10329
10330        # PZ fields
10331        pz_param_pzfields = {}
10332
10333        # PZ field transcripts
10334        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
10335
10336        # Add PZ Transcript in header
10337        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
10338            pz_fields_transcripts,
10339            ".",
10340            "String",
10341            f"Transcript selected from prioritization process, profile {pz_profile_default}",
10342            "unknown",
10343            "unknown",
10344            code_type_map["String"],
10345        )
10346
10347        # Mandatory fields if asked in param
10348        pz_mandatory_fields_list = [
10349            "Score",
10350            "Flag",
10351            "Tags",
10352            "Comment",
10353            "Infos",
10354            "Class",
10355        ]
10356        pz_mandatory_fields = []
10357        for pz_mandatory_field in pz_mandatory_fields_list:
10358            pz_mandatory_fields.append(
10359                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
10360            )
10361
10362        # PZ fields in param
10363        pz_param_mandatory_fields = []
10364        for pz_field in pz_param.get("pzfields", []):
10365            if pz_field in pz_mandatory_fields_list:
10366                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
10367                    pz_param.get("pzprefix", "PTZ") + pz_field
10368                )
10369                pz_param_mandatory_fields.append(
10370                    pz_param.get("pzprefix", "PTZ") + pz_field
10371                )
10372            else:
10373                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
10374                pz_param_pzfields[pz_field] = pz_field_new
10375
10376                # Add PZ Transcript in header
10377                self.get_header().infos[pz_field_new] = vcf.parser._Info(
10378                    pz_field_new,
10379                    ".",
10380                    "String",
10381                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
10382                    "unknown",
10383                    "unknown",
10384                    code_type_map["String"],
10385                )
10386
10387        # PZ fields param
10388        pz_mandatory_fields = pz_param_mandatory_fields
10389        pz_param["pzfields"] = pz_mandatory_fields
10390
10391        # Prioritization
10392        prioritization_result = self.prioritization(
10393            table=transcripts_table,
10394            pz_param=param.get("transcripts", {}).get("prioritization", {}),
10395        )
10396        if not prioritization_result:
10397            log.warning("Transcripts prioritization not processed")
10398            return False
10399
10400        # PZ fields sql query
10401        query_update_select_list = []
10402        query_update_concat_list = []
10403        query_update_order_list = []
10404        for pz_param_pzfield in set(
10405            list(pz_param_pzfields.keys()) + pz_mandatory_fields
10406        ):
10407            query_update_select_list.append(f" {pz_param_pzfield}, ")
10408
10409        for pz_param_pzfield in pz_param_pzfields:
10410            query_update_concat_list.append(
10411                f"""
10412                    , CASE 
10413                        WHEN {pz_param_pzfield} IS NOT NULL
10414                        THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
10415                        ELSE ''
10416                    END
10417                """
10418            )
10419
10420        # Order by
10421        pz_orders = (
10422            param.get("transcripts", {})
10423            .get("prioritization", {})
10424            .get("prioritization_transcripts_order", {})
10425        )
10426        if not pz_orders:
10427            pz_orders = {
10428                pz_param.get("pzprefix", "PTZ") + "Flag": "DESC",
10429                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
10430            }
10431        for pz_order in pz_orders:
10432            query_update_order_list.append(
10433                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
10434            )
10435
10436        # Fields to explode
10437        fields_to_explode = (
10438            list(pz_param_pzfields.keys())
10439            + pz_mandatory_fields
10440            + list(pz_orders.keys())
10441        )
10442        # Remove transcript column as a specific transcript column
10443        if "transcript" in fields_to_explode:
10444            fields_to_explode.remove("transcript")
10445
10446        # Fields intranscripts table
10447        query_transcripts_table = f"""
10448            DESCRIBE SELECT * FROM {transcripts_table}
10449        """
10450        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)
10451
10452        # Check fields to explode
10453        for field_to_explode in fields_to_explode:
10454            if field_to_explode not in self.get_header_infos_list() + list(
10455                query_transcripts_table.column_name
10456            ):
10457                msg_err = f"INFO/{field_to_explode} NOT IN header"
10458                log.error(msg_err)
10459                raise ValueError(msg_err)
10460
10461        # Explode fields to explode
10462        self.explode_infos(
10463            table=transcripts_table,
10464            fields=fields_to_explode,
10465        )
10466
10467        # Transcript preference file
10468        transcripts_preference_file = (
10469            param.get("transcripts", {})
10470            .get("prioritization", {})
10471            .get("prioritization_transcripts", {})
10472        )
10473        transcripts_preference_file = full_path(transcripts_preference_file)
10474
10475        # Transcript preference forced
10476        transcript_preference_force = (
10477            param.get("transcripts", {})
10478            .get("prioritization", {})
10479            .get("prioritization_transcripts_force", False)
10480        )
10481        # Transcript version forced
10482        transcript_version_force = (
10483            param.get("transcripts", {})
10484            .get("prioritization", {})
10485            .get("prioritization_transcripts_version_force", False)
10486        )
10487
10488        # Transcripts Ranking
10489        if transcripts_preference_file:
10490
10491            # Transcripts file to dataframe
10492            if os.path.exists(transcripts_preference_file):
10493                transcripts_preference_dataframe = transcripts_file_to_df(
10494                    transcripts_preference_file
10495                )
10496            else:
10497                log.error(
10498                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10499                )
10500                raise ValueError(
10501                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
10502                )
10503
10504            # Order by depending to transcript preference forcing
10505            if transcript_preference_force:
10506                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
10507            else:
10508                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """
10509
10510            # Transcript columns joined depend on version consideration
10511            if transcript_version_force:
10512                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
10513            else:
10514                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """
10515
10516            # Query ranking for update
10517            query_update_ranking = f"""
10518                SELECT
10519                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
10520                    ROW_NUMBER() OVER (
10521                        PARTITION BY "#CHROM", POS, REF, ALT
10522                        ORDER BY {order_by}
10523                    ) AS rn
10524                FROM {transcripts_table}
10525                LEFT JOIN 
10526                    (
10527                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
10528                        FROM transcripts_preference_dataframe
10529                    ) AS transcripts_preference
10530                ON {transcripts_version_join}
10531            """
10532
10533        else:
10534
10535            # Query ranking for update
10536            query_update_ranking = f"""
10537                SELECT
10538                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
10539                    ROW_NUMBER() OVER (
10540                        PARTITION BY "#CHROM", POS, REF, ALT
10541                        ORDER BY {" , ".join(query_update_order_list)}
10542                    ) AS rn
10543                FROM {transcripts_table}
10544            """
10545
10546        # Export Transcripts prioritization infos to variants table
10547        query_update = f"""
10548            WITH RankedTranscripts AS (
10549                {query_update_ranking}
10550            )
10551            UPDATE {table_variants}
10552                SET
10553                INFO = CONCAT(CASE
10554                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
10555                            THEN ''
10556                            ELSE concat("INFO", ';')
10557                        END,
10558                        concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
10559                        )
10560            FROM
10561                RankedTranscripts
10562            WHERE
10563                rn = 1
10564                AND variants."#CHROM" = RankedTranscripts."#CHROM"
10565                AND variants."POS" = RankedTranscripts."POS"
10566                AND variants."REF" = RankedTranscripts."REF"
10567                AND variants."ALT" = RankedTranscripts."ALT"     
10568        """
10569
10570        # log.debug(f"query_update={query_update}")
10571        self.execute_query(query=query_update)
10572
10573        # Return
10574        return True

The transcripts_prioritization function prioritizes transcripts based on certain parameters and updates the variants table with the prioritized information.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table containing transcripts data. If no value is provided, it defaults to "transcripts". This parameter is used to identify the table where the transcripts data is stored for the prioritization process
  • param: The param parameter in the transcripts_prioritization method is a dictionary that contains various configuration settings for the prioritization process of transcripts. It is used to customize the behavior of the prioritization algorithm and includes settings such as the prefix for prioritization fields, default profiles, and other
Returns

The function transcripts_prioritization returns a boolean value True if the transcripts prioritization process is successfully completed, and False if there are any issues or if no profile is defined for transcripts prioritization.

def create_transcript_view_from_columns_map( self, transcripts_table: str = 'transcripts', columns_maps: dict = {}, added_columns: list = [], temporary_tables: list = None, annotation_fields: list = None, column_rename: dict = {}, column_clean: bool = False, column_case: str = None) -> tuple[list, list, list]:
10576    def create_transcript_view_from_columns_map(
10577        self,
10578        transcripts_table: str = "transcripts",
10579        columns_maps: dict = {},
10580        added_columns: list = [],
10581        temporary_tables: list = None,
10582        annotation_fields: list = None,
10583        column_rename: dict = {},
10584        column_clean: bool = False,
10585        column_case: str = None,
10586    ) -> tuple[list, list, list]:
10587        """
10588        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
10589        specified columns mapping for transcripts data.
10590
10591        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
10592        of the table where the transcripts data is stored or will be stored in the database. This table
10593        typically contains information about transcripts such as Ensembl transcript IDs, gene names,
10594        scores, predictions, etc. It defaults to "transcripts, defaults to transcripts
10595        :type transcripts_table: str (optional)
10596        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information
10597        about how to map columns from a transcripts table to create a view. Each entry in the
10598        `columns_maps` list represents a mapping configuration for a specific set of columns. It
10599        typically includes details such as the main transcript column and additional information columns
10600        :type columns_maps: dict
10601        :param added_columns: The `added_columns` parameter in the
10602        `create_transcript_view_from_columns_map` function is a list that stores the additional columns
10603        that will be added to the view being created based on the columns map provided. These columns
10604        are generated by exploding the transcript information columns along with the main transcript
10605        column
10606        :type added_columns: list
10607        :param temporary_tables: The `temporary_tables` parameter in the
10608        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
10609        tables created during the process of creating a transcript view from a columns map. These
10610        temporary tables are used to store intermediate results or transformations before the final view
10611        is generated
10612        :type temporary_tables: list
10613        :param annotation_fields: The `annotation_fields` parameter in the
10614        `create_transcript_view_from_columns_map` function is a list that stores the fields that are
10615        used for annotation in the query view creation process. These fields are extracted from the
10616        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
10617        :type annotation_fields: list
10618        :param column_rename: The `column_rename` parameter in the
10619        `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify
10620        custom renaming for columns during the creation of the temporary table view. This parameter
10621        provides a mapping of original column names to the desired renamed column names. By using this
10622        parameter,
10623        :type column_rename: dict
10624        :param column_clean: The `column_clean` parameter in the
10625        `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the
10626        column values should be cleaned or not. If set to `True`, the column values will be cleaned by
10627        removing any non-alphanumeric characters from them. This cleaning process ensures, defaults to
10628        False
10629        :type column_clean: bool (optional)
10630        :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map`
10631        function is used to specify the case transformation to be applied to the columns during the view
10632        creation process. It allows you to control whether the column values should be converted to
10633        lowercase, uppercase, or remain unchanged
10634        :type column_case: str
10635        :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three
10636        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
10637        """
10638
10639        log.debug("Start transcrpts view creation from columns map...")
10640
10641        # "from_columns_map": [
10642        #     {
10643        #         "transcripts_column": "Ensembl_transcriptid",
10644        #         "transcripts_infos_columns": [
10645        #             "genename",
10646        #             "Ensembl_geneid",
10647        #             "LIST_S2_score",
10648        #             "LIST_S2_pred",
10649        #         ],
10650        #     },
10651        #     {
10652        #         "transcripts_column": "Ensembl_transcriptid",
10653        #         "transcripts_infos_columns": [
10654        #             "genename",
10655        #             "VARITY_R_score",
10656        #             "Aloft_pred",
10657        #         ],
10658        #     },
10659        # ],
10660
10661        # Init
10662        if temporary_tables is None:
10663            temporary_tables = []
10664        if annotation_fields is None:
10665            annotation_fields = []
10666
10667        # Variants table
10668        table_variants = self.get_table_variants()
10669
10670        for columns_map in columns_maps:
10671
10672            # Log
10673            log.debug(f"columns_map={columns_map}")
10674
10675            # Transcript column
10676            transcripts_column = columns_map.get("transcripts_column", None)
10677
10678            # Transcripts infos columns
10679            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
10680
10681            # Transcripts infos columns rename
10682            column_rename = columns_map.get("column_rename", column_rename)
10683
10684            # Transcripts infos columns clean
10685            column_clean = columns_map.get("column_clean", column_clean)
10686
10687            # Transcripts infos columns case
10688            column_case = columns_map.get("column_case", column_case)
10689
10690            if transcripts_column is not None:
10691
10692                # Explode
10693                added_columns += self.explode_infos(
10694                    fields=[transcripts_column] + transcripts_infos_columns
10695                )
10696
10697                # View clauses
10698                clause_select_variants = []
10699                clause_select_tanscripts = []
10700                for field in [transcripts_column] + transcripts_infos_columns:
10701
10702                    # AS field
10703                    as_field = field
10704
10705                    # Rename
10706                    if column_rename:
10707                        as_field = column_rename.get(as_field, as_field)
10708
10709                    # Clean
10710                    if column_clean:
10711                        as_field = clean_annotation_field(as_field)
10712
10713                    # Case
10714                    if column_case:
10715                        if column_case.lower() in ["lower"]:
10716                            as_field = as_field.lower()
10717                        elif column_case.lower() in ["upper"]:
10718                            as_field = as_field.upper()
10719
10720                    # Clause select Variants
10721                    clause_select_variants.append(
10722                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10723                    )
10724
10725                    if field in [transcripts_column]:
10726                        clause_select_tanscripts.append(
10727                            f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10728                        )
10729                    else:
10730                        clause_select_tanscripts.append(
10731                            f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """
10732                        )
10733                        annotation_fields.append(as_field)
10734
10735                # Query View
10736                query = f""" 
10737                    SELECT
10738                        "#CHROM", POS, REF, ALT, INFO,
10739                        "{transcripts_column}" AS 'transcript',
10740                        {", ".join(clause_select_tanscripts)}
10741                    FROM (
10742                        SELECT 
10743                            "#CHROM", POS, REF, ALT, INFO,
10744                            {", ".join(clause_select_variants)}
10745                        FROM {table_variants}
10746                        )
10747                    WHERE "{transcripts_column}" IS NOT NULL
10748                """
10749
10750                # Create temporary table
10751                temporary_table = transcripts_table + "".join(
10752                    random.choices(string.ascii_uppercase + string.digits, k=10)
10753                )
10754
10755                # Temporary view
10756                temporary_tables.append(temporary_table)
10757                query_view = f"""
10758                    CREATE view {temporary_table}
10759                    AS ({query})
10760                """
10761                self.execute_query(query=query_view)
10762
10763        return added_columns, temporary_tables, annotation_fields

The create_transcript_view_from_columns_map function generates a temporary table view based on specified columns mapping for transcripts data.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table where the transcripts data is stored or will be stored in the database. This table typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, predictions, etc. It defaults to "transcripts".
  • columns_maps: The columns_maps parameter is a dictionary that contains information about how to map columns from a transcripts table to create a view. Each entry in the columns_maps list represents a mapping configuration for a specific set of columns. It typically includes details such as the main transcript column and additional information columns
  • added_columns: The added_columns parameter in the create_transcript_view_from_columns_map function is a list that stores the additional columns that will be added to the view being created based on the columns map provided. These columns are generated by exploding the transcript information columns along with the main transcript column
  • temporary_tables: The temporary_tables parameter in the create_transcript_view_from_columns_map function is a list that stores the names of temporary tables created during the process of creating a transcript view from a columns map. These temporary tables are used to store intermediate results or transformations before the final view is generated
  • annotation_fields: The annotation_fields parameter in the create_transcript_view_from_columns_map function is a list that stores the fields that are used for annotation in the query view creation process. These fields are extracted from the transcripts_column and transcripts_infos_columns specified in the columns_maps parameter.
  • column_rename: The column_rename parameter in the create_transcript_view_from_columns_map function is a dictionary that allows you to specify custom renaming for columns during the creation of the temporary table view. This parameter provides a mapping of original column names to the desired renamed column names. By using this parameter,
  • column_clean: The column_clean parameter in the create_transcript_view_from_columns_map function is a boolean flag that determines whether the column values should be cleaned or not. If set to True, the column values will be cleaned by removing any non-alphanumeric characters from them. This cleaning process ensures, defaults to False
  • column_case: The column_case parameter in the create_transcript_view_from_columns_map function is used to specify the case transformation to be applied to the columns during the view creation process. It allows you to control whether the column values should be converted to lowercase, uppercase, or remain unchanged
Returns

The create_transcript_view_from_columns_map function returns a tuple containing three lists: added_columns, temporary_tables, and annotation_fields.

def create_transcript_view_from_column_format( self, transcripts_table: str = 'transcripts', column_formats: dict = {}, temporary_tables: list = None, annotation_fields: list = None, column_rename: dict = {}, column_clean: bool = False, column_case: str = None) -> tuple[list, list, list]:
def create_transcript_view_from_column_format(
    self,
    transcripts_table: str = "transcripts",
    column_formats: dict = {},
    temporary_tables: list = None,
    annotation_fields: list = None,
    column_rename: dict = {},
    column_clean: bool = False,
    column_case: str = None,
) -> tuple[list, list, list]:
    """
    Generate temporary transcript views based on the specified column formats,
    and collect the added columns, temporary views and annotation fields
    produced along the way.

    :param transcripts_table: The name of the table containing the transcripts
    data, used as a prefix for the randomized temporary view names, defaults
    to transcripts
    :type transcripts_table: str (optional)
    :param column_formats: A list of dictionaries describing the columns to
    process. Each entry maps a transcripts column (key "transcripts_column",
    e.g. "ANN") to a transcripts infos column (key "transcripts_infos_column",
    e.g. "Feature_ID"), and may override `column_rename`, `column_clean` and
    `column_case` for that entry only
    :type column_formats: dict
    :param temporary_tables: A list collecting the names of the temporary views
    created during the process; a new list is created if None
    :type temporary_tables: list
    :param annotation_fields: A list collecting the annotation fields extracted
    from the temporary views (all columns except '#CHROM', 'POS', 'REF' and
    'ALT'); a new list is created if None
    :type annotation_fields: list
    :param column_rename: Default mapping of original column names to new
    column names, applied unless overridden by an entry in `column_formats`
    :type column_rename: dict
    :param column_clean: Default flag to clean the transcripts infos columns,
    applied unless overridden by an entry in `column_formats`, defaults to
    False
    :type column_clean: bool (optional)
    :param column_case: Default case transformation ("upper" or "lower")
    applied to the columns, unless overridden by an entry in `column_formats`
    :type column_case: str
    :return: A tuple of three lists: `added_columns`, `temporary_tables` and
    `annotation_fields`.
    """

    log.debug("Start transcripts view creation from column format...")

    # Expected structure of each entry:
    #  "from_column_format": [
    #     {
    #         "transcripts_column": "ANN",
    #         "transcripts_infos_column": "Feature_ID",
    #     }
    # ],

    # Init
    if temporary_tables is None:
        temporary_tables = []
    if annotation_fields is None:
        annotation_fields = []

    added_columns = []

    for column_format in column_formats:

        # annotation field and transcript annotation field
        annotation_field = column_format.get("transcripts_column", "ANN")
        transcript_annotation = column_format.get(
            "transcripts_infos_column", "Feature_ID"
        )

        # Per-entry overrides, falling back to the function-level defaults.
        # Stored in dedicated variables (instead of rebinding the parameters)
        # so that one entry's settings do not leak into the fallback of the
        # next iteration
        format_column_rename = column_format.get("column_rename", column_rename)
        format_column_clean = column_format.get("column_clean", column_clean)
        format_column_case = column_format.get("column_case", column_case)

        # Temporary view name (randomized suffix to avoid collisions)
        temporary_view_name = transcripts_table + "".join(
            random.choices(string.ascii_uppercase + string.digits, k=10)
        )

        # Create temporary view
        temporary_view_name, view_added_columns = self.annotation_format_to_table(
            annotation_field=annotation_field,
            view_name=temporary_view_name,
            annotation_id=transcript_annotation,
            column_rename=format_column_rename,
            column_clean=format_column_clean,
            column_case=format_column_case,
        )

        # Accumulate added columns (previously the list was overwritten on
        # each iteration, losing the columns added for earlier entries)
        added_columns += view_added_columns

        # Annotation fields
        if temporary_view_name:
            query_annotation_fields = f"""
                SELECT *
                FROM (
                    DESCRIBE SELECT *
                    FROM {temporary_view_name}
                    )
                    WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
            """
            df_annotation_fields = self.get_query_to_df(
                query=query_annotation_fields
            )

            # Add temporary view and annotation fields
            temporary_tables.append(temporary_view_name)
            annotation_fields += list(set(df_annotation_fields["column_name"]))

    return added_columns, temporary_tables, annotation_fields

The create_transcript_view_from_column_format function generates a transcript view based on specified column formats, adds additional columns and annotation fields, and returns the list of temporary tables and annotation fields.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table containing the transcripts data. This table will be used as the base table for creating the transcript view. The default value for this parameter is "transcripts", but you can provide a different table name if needed, defaults to transcripts
  • column_formats: The column_formats parameter is a dictionary that contains information about the columns to be used for creating the transcript view. Each entry in the dictionary specifies the mapping between a transcripts column and a transcripts infos column. This parameter allows you to define how the columns from the transcripts table should be transformed or mapped
  • temporary_tables: The temporary_tables parameter in the create_transcript_view_from_column_format function is a list that stores the names of temporary views created during the process of creating a transcript view from a column format. These temporary views are used to manipulate and extract data before generating the final transcript view
  • annotation_fields: The annotation_fields parameter in the create_transcript_view_from_column_format function is a list that stores the annotation fields that are extracted from the temporary views created during the process. These annotation fields are obtained by querying the temporary views and extracting the column names excluding specific columns like `#CHROM`, `POS`, `REF`, and `ALT`
  • column_rename: The column_rename parameter in the create_transcript_view_from_column_format function is a dictionary that allows you to specify custom renaming of columns in the transcripts infos table. By providing a mapping of original column names to new column names in this dictionary, you can rename specific columns during the process
  • column_clean: The column_clean parameter in the create_transcript_view_from_column_format function is a boolean flag that determines whether the transcripts infos columns should undergo a cleaning process. If set to True, the columns will be cleaned during the creation of the transcript view based on the specified column format, defaults to False
  • column_case: The column_case parameter in the create_transcript_view_from_column_format function is used to specify the case transformation to be applied to the columns in the transcript view. It can be set to either "upper" or "lower" to convert the column names to uppercase or lowercase, respectively
Returns

The create_transcript_view_from_column_format function returns a tuple of three lists: added_columns, temporary_tables, and annotation_fields.

def create_transcript_view( self, transcripts_table: str = None, transcripts_table_drop: bool = False, param: dict = {}) -> str:
def create_transcript_view(
    self,
    transcripts_table: str = None,
    transcripts_table_drop: bool = False,
    param: dict = {},
) -> str:
    """
    Generate a transcript view by merging temporary views built according to
    the `transcripts` structure of the parameters, optionally mapping and
    de-versioning transcript identifiers, and materialize the result into a
    table, chromosome by chromosome.

    :param transcripts_table: The name of the table that will store the final
    transcript view data. If None, the name is read from
    `param["transcripts"]["table"]`, falling back to "transcripts"
    :type transcripts_table: str (optional)
    :param transcripts_table_drop: Whether to drop an existing transcripts
    table before creating a new one. If False and the table already exists,
    the function returns early without rebuilding it, defaults to False
    :type transcripts_table_drop: bool (optional)
    :param param: A dictionary describing how to create the transcript view
    (structure of the transcripts, columns mapping, column formats, transcript
    id mapping options). If empty, the object's own parameters are used
    :type param: dict
    :return: The name of the transcripts table that was created or found, or
    None when no structure information is available.
    """

    log.debug("Start transcripts view creation...")

    # Default table name
    transcripts_table_default = "transcripts"

    # Param
    if not param:
        param = self.get_param()

    # Struct
    struct = param.get("transcripts", {}).get("struct", None)

    # Transcript version removal option
    transcript_id_remove_version = param.get("transcripts", {}).get(
        "transcript_id_remove_version", False
    )

    # Transcripts mapping file
    transcript_id_mapping_file = param.get("transcripts", {}).get(
        "transcript_id_mapping_file", None
    )

    # Restrict output to transcripts present in the mapping file
    transcript_id_mapping_force = param.get("transcripts", {}).get(
        "transcript_id_mapping_force", None
    )

    # Transcripts table
    if transcripts_table is None:
        transcripts_table = param.get("transcripts", {}).get(
            "table", transcripts_table_default
        )

    # Check transcripts table exists
    if transcripts_table:

        # Query to check if transcripts table exists
        query_check_table = f"""
            SELECT * 
            FROM information_schema.tables 
            WHERE table_name = '{transcripts_table}'
        """
        df_check_table = self.get_query_to_df(query=query_check_table)

        # Table already exists and no drop requested: nothing to do
        if len(df_check_table) > 0 and not transcripts_table_drop:
            log.debug(f"Table {transcripts_table} exists and not drop option")
            return transcripts_table

    if struct:

        # Columns added to the variants table (dropped at the end)
        added_columns = []

        # Temporary views/tables created along the way (dropped at the end)
        temporary_tables = []

        # Annotation fields gathered from the temporary views
        annotation_fields = []

        # from columns map
        columns_maps = struct.get("from_columns_map", [])
        added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
            self.create_transcript_view_from_columns_map(
                transcripts_table=transcripts_table,
                columns_maps=columns_maps,
                added_columns=added_columns,
                temporary_tables=temporary_tables,
                annotation_fields=annotation_fields,
            )
        )
        added_columns += added_columns_tmp
        temporary_tables += temporary_tables_tmp
        annotation_fields += annotation_fields_tmp

        # from column format
        # NOTE: the result is assigned to `added_columns_tmp` — previously the
        # accumulated `added_columns` was clobbered here and the stale tmp list
        # from the columns-map step was re-added instead
        column_formats = struct.get("from_column_format", [])
        added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
            self.create_transcript_view_from_column_format(
                transcripts_table=transcripts_table,
                column_formats=column_formats,
                temporary_tables=temporary_tables,
                annotation_fields=annotation_fields,
            )
        )
        added_columns += added_columns_tmp
        temporary_tables += temporary_tables_tmp
        annotation_fields += annotation_fields_tmp

        # Remove some specific fields/columns from the annotation fields
        annotation_fields = list(set(annotation_fields))
        for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
            if field in annotation_fields:
                annotation_fields.remove(field)

        # Merge temporary tables query
        query_merge = ""
        for temporary_table in list(set(temporary_tables)):

            # First temporary table
            if not query_merge:
                query_merge = f"""
                    SELECT * FROM {temporary_table}
                """
            # other temporary table (using UNION)
            else:
                query_merge += f"""
                    UNION BY NAME SELECT * FROM {temporary_table}
                """

        # Temporary table aliases used in the nested queries below
        transcript_table_tmp = "transcripts_tmp"
        transcript_table_tmp2 = "transcripts_tmp2"
        transcript_table_tmp3 = "transcripts_tmp3"

        # SELECT clauses aggregating annotations per transcript
        query_merge_on_transcripts_annotation_fields = []

        # Add transcript list
        query_merge_on_transcripts_annotation_fields.append(
            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
        )

        # Aggregate all annotations fields
        for annotation_field in set(annotation_fields):
            query_merge_on_transcripts_annotation_fields.append(
                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
            )

        # Transcripts mapping
        if transcript_id_mapping_file:

            # Transcript mapping dataframe
            # NOTE(review): the local variable appears unused but the SQL below
            # references it by name ("transcript_id_mapping_dataframe") —
            # presumably resolved by DuckDB's replacement scan; keep both in sync
            transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
            transcript_id_mapping_dataframe = transcripts_file_to_df(
                transcript_id_mapping_file, column_names=["transcript", "alias"]
            )

            # Transcript version remove
            if transcript_id_remove_version:
                query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
                query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
                query_left_join = f"""
                    LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                """
            else:
                query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
                query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
                query_left_join = f"""
                    LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                """

            # Transcript column for group by merge: prefer the mapped
            # transcript id, fall back to the original one
            query_transcript_merge_group_by = """
                    CASE
                        WHEN transcript_mapped NOT IN ('')
                        THEN split_part(transcript_mapped, '.', 1)
                        ELSE split_part(transcript_original, '.', 1)
                    END
                """

            # Merge query
            transcripts_tmp2_query = f"""
                SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
                FROM ({query_merge}) AS {transcript_table_tmp}
                {query_left_join}
                GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
            """

            # Retrieve columns after merge
            transcripts_tmp2_describe_query = f"""
                DESCRIBE {transcripts_tmp2_query}
            """
            transcripts_tmp2_describe_list = list(
                self.get_query_to_df(query=transcripts_tmp2_describe_query)[
                    "column_name"
                ]
            )

            # Create list of columns for select clause
            transcripts_tmp2_describe_select_clause = []
            for field in transcripts_tmp2_describe_list:
                if field not in [
                    "#CHROM",
                    "POS",
                    "REF",
                    "ALT",
                    "INFO",
                    "transcript_mapped",
                ]:
                    as_field = field
                    if field in ["transcript_original"]:
                        as_field = "transcripts_mapped"
                    transcripts_tmp2_describe_select_clause.append(
                        f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
                    )

            # Merge with mapping
            query_merge_on_transcripts = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, INFO,
                    CASE
                        WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
                        THEN ANY_VALUE(transcript_mapped)
                        ELSE ANY_VALUE(transcript_original)
                    END AS transcript,
                    {", ".join(transcripts_tmp2_describe_select_clause)}
                FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
                GROUP BY "#CHROM", POS, REF, ALT, INFO,
                    {query_transcript_merge_group_by}
            """

            # Add transcript filter from mapping file
            if transcript_id_mapping_force:
                query_merge_on_transcripts = f"""
                    SELECT *
                    FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
                    WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
                """

        # No transcript mapping
        else:

            # Remove transcript version
            if transcript_id_remove_version:
                query_transcript_column = f"""
                    split_part({transcript_table_tmp}.transcript, '.', 1)
                """
            else:
                query_transcript_column = """
                    transcript
                """

            # Query for transcripts view
            query_merge_on_transcripts = f"""
                SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
                FROM ({query_merge}) AS {transcript_table_tmp}
                GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
            """

        # Drop transcript view if necessary
        if transcripts_table_drop:
            query_drop = f"""
                DROP TABLE IF EXISTS {transcripts_table};
            """
            self.execute_query(query=query_drop)

        # List of unique #CHROM
        query_unique_chrom = """
            SELECT DISTINCT "#CHROM"
            FROM variants AS subquery
        """
        unique_chroms = self.get_query_to_df(query=query_unique_chrom)

        # Create table with structure but without data, if not exists
        query_create_table = f"""
            CREATE TABLE IF NOT EXISTS {transcripts_table} AS
            SELECT * FROM ({query_merge_on_transcripts}) AS subquery LIMIT 0
        """
        self.execute_query(query=query_create_table)

        # Process by #CHROM
        for chrom in unique_chroms["#CHROM"]:

            # Log
            log.debug(f"Processing #CHROM={chrom}")

            # Select data by #CHROM
            query_chunk = f"""
                SELECT *
                FROM ({query_merge_on_transcripts})
                WHERE "#CHROM" = '{chrom}'
            """

            # Insert data
            query_insert_chunk = f"""
                INSERT INTO {transcripts_table}
                {query_chunk}
            """
            self.execute_query(query=query_insert_chunk)

        # Remove temporary tables/views (a name may be either one)
        if temporary_tables:
            for temporary_table in list(set(temporary_tables)):
                try:
                    query_drop_tmp_table = f"""
                        DROP TABLE IF EXISTS {temporary_table}
                    """
                    self.execute_query(query=query_drop_tmp_table)
                except Exception:
                    log.debug(f"'{temporary_table}' Not a table")
                try:
                    query_drop_tmp_table = f"""
                        DROP VIEW IF EXISTS {temporary_table}
                    """
                    self.execute_query(query=query_drop_tmp_table)
                except Exception:
                    log.debug(f"'{temporary_table}' Not a view")

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    else:

        transcripts_table = None

    return transcripts_table

The create_transcript_view function generates a transcript view by processing data from a specified table based on provided parameters and structural information.

Parameters
  • transcripts_table: The transcripts_table parameter in the create_transcript_view function is used to specify the name of the table that will store the final transcript view data. If a table name is not provided, the function will create a new table to store the transcript view data, named "transcripts" by default, defaults to transcripts
  • transcripts_table_drop: The transcripts_table_drop parameter in the create_transcript_view function is a boolean parameter that determines whether to drop the existing transcripts table before creating a new one. If transcripts_table_drop is set to True, the function will drop the existing transcripts table if it exists, defaults to False
  • param: The param parameter in the create_transcript_view function is a dictionary that contains information needed to create a transcript view. It includes details such as the structure of the transcripts, columns mapping, column formats, and other necessary information for generating the view. This parameter allows for flexibility and customization
Returns

The create_transcript_view function returns the name of the transcripts table that was created or modified during the execution of the function.

def annotation_format_to_table( self, annotation_field: str = 'ANN', annotation_id: str = 'Feature_ID', view_name: str = 'transcripts', column_rename: dict = {}, column_clean: bool = False, column_case: str = None) -> str:
11237    def annotation_format_to_table(
11238        self,
11239        annotation_field: str = "ANN",
11240        annotation_id: str = "Feature_ID",
11241        view_name: str = "transcripts",
11242        column_rename: dict = {},
11243        column_clean: bool = False,
11244        column_case: str = None,
11245    ) -> str:
11246        """
11247        The `annotation_format_to_table` function converts annotation data from a VCF file into a
11248        structured table format, ensuring unique values and creating a temporary table for further
11249        processing or analysis.
11250
11251        :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure
11252        unique values in the output or not. If set to `True`, the function will make sure that the
11253        output values are unique, defaults to True
11254        :type uniquify: bool (optional)
11255        :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file
11256        that contains the annotation information for each variant. This field is used to extract the
11257        annotation details for further processing in the function. By default, it is set to "ANN",
11258        defaults to ANN
11259        :type annotation_field: str (optional)
11260        :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method
11261        is used to specify the identifier for the annotation feature. This identifier will be used as a
11262        column name in the resulting table or view that is created based on the annotation data. It
11263        helps in uniquely identifying each annotation entry in the, defaults to Feature_ID
11264        :type annotation_id: str (optional)
11265        :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used
11266        to specify the name of the temporary table that will be created to store the transformed
11267        annotation data. This table will hold the extracted information from the annotation field in a
11268        structured format for further processing or analysis. By default,, defaults to transcripts
11269        :type view_name: str (optional)
11270        :param column_rename: The `column_rename` parameter in the `annotation_format_to_table` method
11271        is a dictionary that allows you to specify custom renaming for columns. By providing key-value
11272        pairs in this dictionary, you can rename specific columns in the resulting table or view that is
11273        created based on the annotation data. This feature enables
11274        :type column_rename: dict
11275        :param column_clean: The `column_clean` parameter in the `annotation_format_to_table` method is
11276        a boolean flag that determines whether the annotation field should undergo a cleaning process.
11277        If set to `True`, the function will clean the annotation field before further processing. This
11278        cleaning step may involve removing any unwanted characters, formatting inconsistencies, defaults
11279        to False
11280        :type column_clean: bool (optional)
11281        :param column_case: The `column_case` parameter in the `annotation_format_to_table` method is
11282        used to specify the case transformation to be applied to the column names extracted from the
11283        annotation data. It allows you to set the case of the column names to either lowercase or
11284        uppercase for consistency or other specific requirements during the conversion
11285        :type column_case: str
11286        :return: The function `annotation_format_to_table` is returning the name of the view created,
11287        which is stored in the variable `view_name`.
11288        """
11289
11290        # Transcript annotation
11291        if column_rename:
11292            annotation_id = column_rename.get(annotation_id, annotation_id)
11293
11294        if column_clean:
11295            annotation_id = clean_annotation_field(annotation_id)
11296
11297        # Prefix
11298        prefix = self.get_explode_infos_prefix()
11299        if prefix:
11300            prefix = "INFO/"
11301
11302        # Variants table
11303        table_variants = self.get_table_variants()
11304
11305        # Header
11306        vcf_reader = self.get_header()
11307
11308        # Add columns
11309        added_columns = []
11310
11311        # Explode HGVS field in column
11312        added_columns += self.explode_infos(fields=[annotation_field])
11313
11314        if annotation_field in vcf_reader.infos:
11315
11316            # Extract ANN header
11317            ann_description = vcf_reader.infos[annotation_field].desc
11318            pattern = r"'(.+?)'"
11319            match = re.search(pattern, ann_description)
11320            if match:
11321                ann_header_match = match.group(1).split(" | ")
11322                ann_header = []
11323                ann_header_desc = {}
11324                for i in range(len(ann_header_match)):
11325                    ann_header_info = "".join(
11326                        char for char in ann_header_match[i] if char.isalnum()
11327                    )
11328                    ann_header.append(ann_header_info)
11329                    ann_header_desc[ann_header_info] = ann_header_match[i]
11330                if not ann_header_desc:
11331                    raise ValueError("Invalid header description format")
11332            else:
11333                raise ValueError("Invalid header description format")
11334
11335            # Create dataframe for keys column type
11336            dataframe_annotation_format = self.get_query_to_df(
11337                f""" 
11338                WITH exploded_annotations AS (
11339                    SELECT
11340                        UNNEST(STRING_SPLIT(ANN, ',')) AS annotation
11341                    FROM {table_variants}
11342                ),
11343                split_annotations AS (
11344                    SELECT
11345                        {", ".join([f"SPLIT_PART(annotation, '|', {i+1}) AS '{header}'" for i, header in enumerate(ann_header_desc.values())])},
11346                    FROM exploded_annotations
11347                )
11348                SELECT * FROM split_annotations
11349                LIMIT 1000
11350                """
11351            )
11352
11353            # Init
11354            query_list_keys = []
11355            key_i = 0
11356
11357            for key in dataframe_annotation_format.keys():
11358
11359                # Key
11360                key_i += 1
11361                key_clean = key
11362
11363                # key rename
11364                if column_rename:
11365                    key_clean = column_rename.get(key_clean, key_clean)
11366
11367                # key clean
11368                if column_clean:
11369                    key_clean = clean_annotation_field(key_clean)
11370
11371                # Key case
11372                if column_case:
11373                    if column_case.lower() in ["lower"]:
11374                        key_clean = key_clean.lower()
11375                    elif column_case.lower() in ["upper"]:
11376                        key_clean = key_clean.upper()
11377
11378                # Detect column type
11379                column_type = detect_column_type(dataframe_annotation_format[key])
11380
11381                # Append key to list
11382                query_list_keys.append(
11383                    f""" NULLIF(SPLIT_PART(annotation, '|', {key_i}), '')::{column_type} AS '{prefix}{key_clean}' """
11384                )
11385
11386            # Create temporary table
11387            query_create_view = f"""
11388                CREATE VIEW {view_name} AS (
11389                    WITH exploded_annotations AS (
11390                        SELECT
11391                            "#CHROM",
11392                            POS,
11393                            REF,
11394                            ALT,
11395                            INFO,
11396                            UNNEST(STRING_SPLIT(ANN, ',')) AS annotation
11397                        FROM {table_variants}
11398                    ),
11399                    split_annotations AS (
11400                        SELECT
11401                            "#CHROM",
11402                            POS,
11403                            REF,
11404                            ALT,
11405                            INFO,
11406                            {", ".join(query_list_keys)},
11407                        FROM exploded_annotations
11408                    )
11409                    SELECT *, {annotation_id} AS 'transcript' FROM split_annotations
11410                )
11411            """
11412            log.debug(f"query_create_view: {query_create_view}")
11413            self.execute_query(query=query_create_view)
11414
11415        else:
11416
11417            # Return None
11418            view_name = None
11419
11420        return view_name, added_columns

The annotation_format_to_table function converts annotation data from a VCF file into a structured table format, ensuring unique values and creating a temporary table for further processing or analysis.

Parameters
  • uniquify: The uniquify parameter is a boolean flag that determines whether to ensure unique values in the output or not. If set to True, the function will make sure that the output values are unique, defaults to True
  • annotation_field: The annotation_field parameter refers to the field in the VCF file that contains the annotation information for each variant. This field is used to extract the annotation details for further processing in the function. By default, it is set to "ANN", defaults to ANN
  • annotation_id: The annotation_id parameter in the annotation_format_to_table method is used to specify the identifier for the annotation feature. This identifier will be used as a column name in the resulting table or view that is created based on the annotation data. It helps in uniquely identifying each annotation entry in the, defaults to Feature_ID
  • view_name: The view_name parameter in the annotation_format_to_table method is used to specify the name of the temporary table that will be created to store the transformed annotation data. This table will hold the extracted information from the annotation field in a structured format for further processing or analysis; defaults to transcripts
  • column_rename: The column_rename parameter in the annotation_format_to_table method is a dictionary that allows you to specify custom renaming for columns. By providing key-value pairs in this dictionary, you can rename specific columns in the resulting table or view that is created based on the annotation data. This feature enables
  • column_clean: The column_clean parameter in the annotation_format_to_table method is a boolean flag that determines whether the annotation field should undergo a cleaning process. If set to True, the function will clean the annotation field before further processing. This cleaning step may involve removing any unwanted characters, formatting inconsistencies, defaults to False
  • column_case: The column_case parameter in the annotation_format_to_table method is used to specify the case transformation to be applied to the column names extracted from the annotation data. It allows you to set the case of the column names to either lowercase or uppercase for consistency or other specific requirements during the conversion
Returns

The function annotation_format_to_table is returning the name of the view created, which is stored in the variable view_name.

def transcript_view_to_variants( self, transcripts_table: str = None, transcripts_column_id: str = None, transcripts_info_json: str = None, transcripts_info_field_json: str = None, transcripts_info_format: str = None, transcripts_info_field_format: str = None, param: dict = {}) -> bool:
11422    def transcript_view_to_variants(
11423        self,
11424        transcripts_table: str = None,
11425        transcripts_column_id: str = None,
11426        transcripts_info_json: str = None,
11427        transcripts_info_field_json: str = None,
11428        transcripts_info_format: str = None,
11429        transcripts_info_field_format: str = None,
11430        param: dict = {},
11431    ) -> bool:
11432        """
11433        The `transcript_view_to_variants` function updates a variants table with information from
11434        transcripts in JSON format.
11435
11436        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the
11437        table containing the transcripts data. If this parameter is not provided, the function will
11438        attempt to retrieve it from the `param` dictionary or use a default value of "transcripts"
11439        :type transcripts_table: str
11440        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the
11441        column in the `transcripts_table` that contains the unique identifier for each transcript. This
11442        identifier is used to match transcripts with variants in the database
11443        :type transcripts_column_id: str
11444        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name
11445        of the column in the variants table where the transcripts information will be stored in JSON
11446        format. This parameter allows you to define the column in the variants table that will hold the
11447        JSON-formatted information about transcripts
11448        :type transcripts_info_json: str
11449        :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to
11450        specify the field in the VCF header that will contain information about transcripts in JSON
11451        format. This field will be added to the VCF header as an INFO field with the specified name
11452        :type transcripts_info_field_json: str
11453        :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the
11454        format of the information about transcripts that will be stored in the variants table. This
11455        format can be used to define how the transcript information will be structured or displayed
11456        within the variants table
11457        :type transcripts_info_format: str
11458        :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to
11459        specify the field in the VCF header that will contain information about transcripts in a
11460        specific format. This field will be added to the VCF header as an INFO field with the specified
11461        name
11462        :type transcripts_info_field_format: str
11463        :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary
11464        that contains various configuration settings related to transcripts. It is used to provide
11465        default values for certain parameters if they are not explicitly provided when calling the
11466        method. The `param` dictionary can be passed as an argument
11467        :type param: dict
11468        :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True`
11469        if the operation is successful and `False` if certain conditions are not met.
11470        """
11471
11472        msg_info_prefix = "Start transcripts view to variants annotations"
11473
11474        log.debug(f"{msg_info_prefix}...")
11475
11476        # Default
11477        transcripts_table_default = "transcripts"
11478        transcripts_column_id_default = "transcript"
11479        transcripts_info_json_default = None
11480        transcripts_info_format_default = None
11481        transcripts_info_field_json_default = None
11482        transcripts_info_field_format_default = None
11483
11484        # Param
11485        if not param:
11486            param = self.get_param()
11487
11488        # Transcripts table
11489        if transcripts_table is None:
11490            transcripts_table = param.get("transcripts", {}).get(
11491                "table", transcripts_table_default
11492            )
11493
11494        # Transcripts column ID
11495        if transcripts_column_id is None:
11496            transcripts_column_id = param.get("transcripts", {}).get(
11497                "column_id", transcripts_column_id_default
11498            )
11499
11500        # Transcripts info json
11501        if transcripts_info_json is None:
11502            transcripts_info_json = param.get("transcripts", {}).get(
11503                "transcripts_info_json", transcripts_info_json_default
11504            )
11505
11506        # Transcripts info field JSON
11507        if transcripts_info_field_json is None:
11508            transcripts_info_field_json = param.get("transcripts", {}).get(
11509                "transcripts_info_field_json", transcripts_info_field_json_default
11510            )
11511        # if transcripts_info_field_json is not None and transcripts_info_json is None:
11512        #     transcripts_info_json = transcripts_info_field_json
11513
11514        # Transcripts info format
11515        if transcripts_info_format is None:
11516            transcripts_info_format = param.get("transcripts", {}).get(
11517                "transcripts_info_format", transcripts_info_format_default
11518            )
11519
11520        # Transcripts info field FORMAT
11521        if transcripts_info_field_format is None:
11522            transcripts_info_field_format = param.get("transcripts", {}).get(
11523                "transcripts_info_field_format", transcripts_info_field_format_default
11524            )
11525        # if (
11526        #     transcripts_info_field_format is not None
11527        #     and transcripts_info_format is None
11528        # ):
11529        #     transcripts_info_format = transcripts_info_field_format
11530
11531        # Variants table
11532        table_variants = self.get_table_variants()
11533
11534        # Check info columns param
11535        if (
11536            transcripts_info_json is None
11537            and transcripts_info_field_json is None
11538            and transcripts_info_format is None
11539            and transcripts_info_field_format is None
11540        ):
11541            return False
11542
11543        # Transcripts infos columns
11544        query_transcripts_infos_columns = f"""
11545            SELECT *
11546            FROM (
11547                DESCRIBE SELECT * FROM {transcripts_table}
11548                )
11549            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
11550        """
11551        transcripts_infos_columns = list(
11552            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
11553        )
11554
11555        # View results
11556        clause_select = []
11557        clause_to_json = []
11558        clause_to_format = []
11559        for field in transcripts_infos_columns:
11560            # Do not consider INFO field for export into fields
11561            if field not in ["INFO"]:
11562                clause_select.append(
11563                    f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """
11564                )
11565                clause_to_json.append(f""" '{field}': "{field}" """)
11566                clause_to_format.append(f""" "{field}" """)
11567
11568        # Update
11569        update_set_json = []
11570        update_set_format = []
11571
11572        # VCF header
11573        vcf_reader = self.get_header()
11574
11575        # Transcripts to info column in JSON
11576        if transcripts_info_json:
11577
11578            # Create column on variants table
11579            self.add_column(
11580                table_name=table_variants,
11581                column_name=transcripts_info_json,
11582                column_type="JSON",
11583                default_value=None,
11584                drop=False,
11585            )
11586
11587            # Add header
11588            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
11589                transcripts_info_json,
11590                ".",
11591                "String",
11592                "Transcripts in JSON format",
11593                "unknwon",
11594                "unknwon",
11595                self.code_type_map["String"],
11596            )
11597
11598            # Add to update
11599            update_set_json.append(
11600                f""" {transcripts_info_json}=t.{transcripts_info_json} """
11601            )
11602
11603        # Transcripts to info field in JSON
11604        if transcripts_info_field_json:
11605
11606            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")
11607
11608            # Add to update
11609            update_set_json.append(
11610                f""" 
11611                    INFO = concat(
11612                            CASE
11613                                WHEN INFO NOT IN ('', '.')
11614                                THEN INFO
11615                                ELSE ''
11616                            END,
11617                            CASE
11618                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
11619                                THEN concat(
11620                                    ';{transcripts_info_field_json}=',
11621                                    t.{transcripts_info_json}
11622                                )
11623                                ELSE ''
11624                            END
11625                            )
11626                """
11627            )
11628
11629            # Add header
11630            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
11631                transcripts_info_field_json,
11632                ".",
11633                "String",
11634                "Transcripts in JSON format",
11635                "unknwon",
11636                "unknwon",
11637                self.code_type_map["String"],
11638            )
11639
11640        if update_set_json:
11641
11642            # Update query
11643            query_update = f"""
11644                UPDATE {table_variants}
11645                    SET {", ".join(update_set_json)}
11646                FROM
11647                (
11648                    SELECT
11649                        "#CHROM", POS, REF, ALT,
11650                            concat(
11651                            '{{',
11652                            string_agg(
11653                                '"' || "{transcripts_column_id}" || '":' ||
11654                                to_json(json_output)
11655                            ),
11656                            '}}'
11657                            )::JSON AS {transcripts_info_json}
11658                    FROM
11659                        (
11660                        SELECT
11661                            "#CHROM", POS, REF, ALT,
11662                            "{transcripts_column_id}",
11663                            to_json(
11664                                {{{",".join(clause_to_json)}}}
11665                            )::JSON AS json_output
11666                        FROM
11667                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11668                        WHERE "{transcripts_column_id}" IS NOT NULL
11669                        )
11670                    GROUP BY "#CHROM", POS, REF, ALT
11671                ) AS t
11672                WHERE {table_variants}."#CHROM" = t."#CHROM"
11673                    AND {table_variants}."POS" = t."POS"
11674                    AND {table_variants}."REF" = t."REF"
11675                    AND {table_variants}."ALT" = t."ALT"
11676            """
11677
11678            self.execute_query(query=query_update)
11679
11680        # Transcripts to info column in FORMAT
11681        if transcripts_info_format:
11682
11683            # Create column on variants table
11684            self.add_column(
11685                table_name=table_variants,
11686                column_name=transcripts_info_format,
11687                column_type="VARCHAR",
11688                default_value=None,
11689                drop=False,
11690            )
11691
11692            # Add header
11693            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
11694                transcripts_info_format,
11695                ".",
11696                "String",
11697                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11698                "unknwon",
11699                "unknwon",
11700                self.code_type_map["String"],
11701            )
11702
11703            # Add to update
11704            update_set_format.append(
11705                f""" {transcripts_info_format}=t.{transcripts_info_format} """
11706            )
11707
11708        else:
11709
11710            # Set variable for internal queries
11711            transcripts_info_format = "transcripts_info_format"
11712
11713        # Transcripts to info field in JSON
11714        if transcripts_info_field_format:
11715
11716            log.debug(f"{msg_info_prefix} - Annotation in structured format...")
11717
11718            # Add to update
11719            update_set_format.append(
11720                f""" 
11721                    INFO = concat(
11722                            CASE
11723                                WHEN INFO NOT IN ('', '.')
11724                                THEN INFO
11725                                ELSE ''
11726                            END,
11727                            CASE
11728                                WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
11729                                THEN concat(
11730                                    ';{transcripts_info_field_format}=',
11731                                    t.{transcripts_info_format}
11732                                )
11733                                ELSE ''
11734                            END
11735                            )
11736                """
11737            )
11738
11739            # Add header
11740            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
11741                transcripts_info_field_format,
11742                ".",
11743                "String",
11744                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
11745                "unknwon",
11746                "unknwon",
11747                self.code_type_map["String"],
11748            )
11749
11750        if update_set_format:
11751
11752            # Update query
11753            query_update = f"""
11754                UPDATE {table_variants}
11755                    SET {", ".join(update_set_format)}
11756                FROM
11757                (
11758                    SELECT
11759                        "#CHROM", POS, REF, ALT,
11760                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
11761                    FROM 
11762                        (
11763                        SELECT
11764                            "#CHROM", POS, REF, ALT,
11765                            "{transcripts_column_id}",
11766                            concat(
11767                                "{transcripts_column_id}",
11768                                '|',
11769                                {", '|', ".join(clause_to_format)}
11770                            ) AS {transcripts_info_format}
11771                        FROM
11772                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
11773                        )
11774                    GROUP BY "#CHROM", POS, REF, ALT
11775                ) AS t
11776                WHERE {table_variants}."#CHROM" = t."#CHROM"
11777                    AND {table_variants}."POS" = t."POS"
11778                    AND {table_variants}."REF" = t."REF"
11779                    AND {table_variants}."ALT" = t."ALT"
11780            """
11781
11782            self.execute_query(query=query_update)
11783
11784        return True

The transcript_view_to_variants function updates a variants table with information from transcripts in JSON format.

Parameters
  • transcripts_table: The transcripts_table parameter is used to specify the name of the table containing the transcripts data. If this parameter is not provided, the function will attempt to retrieve it from the param dictionary or use a default value of "transcripts"
  • transcripts_column_id: The transcripts_column_id parameter is used to specify the column in the transcripts_table that contains the unique identifier for each transcript. This identifier is used to match transcripts with variants in the database
  • transcripts_info_json: The transcripts_info_json parameter is used to specify the name of the column in the variants table where the transcripts information will be stored in JSON format. This parameter allows you to define the column in the variants table that will hold the JSON-formatted information about transcripts
  • transcripts_info_field_json: The transcripts_info_field_json parameter is used to specify the field in the VCF header that will contain information about transcripts in JSON format. This field will be added to the VCF header as an INFO field with the specified name
  • transcripts_info_format: The transcripts_info_format parameter is used to specify the format of the information about transcripts that will be stored in the variants table. This format can be used to define how the transcript information will be structured or displayed within the variants table
  • transcripts_info_field_format: The transcripts_info_field_format parameter is used to specify the field in the VCF header that will contain information about transcripts in a specific format. This field will be added to the VCF header as an INFO field with the specified name
  • param: The param parameter in the transcript_view_to_variants method is a dictionary that contains various configuration settings related to transcripts. It is used to provide default values for certain parameters if they are not explicitly provided when calling the method. The param dictionary can be passed as an argument
Returns

The function transcript_view_to_variants returns a boolean value. It returns True if the operation is successful and False if certain conditions are not met.

def rename_info_fields(self, fields_to_rename: dict = None, table: str = None) -> dict:
11786    def rename_info_fields(
11787        self, fields_to_rename: dict = None, table: str = None
11788    ) -> dict:
11789        """
11790        The `rename_info_fields` function renames specified fields in a VCF file header and updates
11791        corresponding INFO fields in the variants table.
11792
11793        :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that contains the
11794        mapping of fields to be renamed in a VCF (Variant Call Format) file. The keys in the dictionary
11795        represent the original field names that need to be renamed, and the corresponding values
11796        represent the new names to which the fields should be renamed (a value of None removes the field)
11797        :type fields_to_rename: dict
11798        :param table: The `table` parameter in the `rename_info_fields` function represents the name of
11799        the table in which the variants data is stored. This table contains information about genetic
11800        variants, and the function updates the corresponding INFO fields in this table when renaming
11801        specified fields in the VCF file header
11802        :type table: str
11803        :return: The `rename_info_fields` function returns a dictionary `fields_renamed` that contains
11804        the original field names as keys and their corresponding new names (or None if the field was
11805        removed) as values after renaming or removing specified fields in a VCF file header and updating
11806        corresponding INFO fields in the variants table.
11807        """
11808
11809        # Init result dict and access mode; no table update is performed in read-only ('RO') mode
11810        fields_renamed = {}
11811        config = self.get_config()
11812        access = config.get("access")
11813
11814        if table is None:
11815            table = self.get_table_variants()
11816
11817        # Regexp replace setup: renames are folded into nested regexp_replace calls, partitioned to bound expression depth
11818        regex_replace_dict = {}
11819        regex_replace_nb = 0
11820        regex_replace_partition = 125
11821        regex_replace = "concat(INFO, ';')"  # Add ';' to reduce regexp complexity
11822
11823        if fields_to_rename is not None and access not in ["RO"]:
11824
11825            log.info("Rename or remove fields...")
11826
11827            # Header
11828            header = self.get_header()
11829
11830            for field_to_rename, field_renamed in fields_to_rename.items():
11831
11832                if field_to_rename in header.infos:
11833
11834                    # Rename header entry — NOTE(review): passes a 'type_code' attribute, which is not part of standard PyVCF _Info; confirm the project's extended header type
11835                    if field_renamed is not None:
11836                        header.infos[field_renamed] = vcf.parser._Info(
11837                            field_renamed,
11838                            header.infos[field_to_rename].num,
11839                            header.infos[field_to_rename].type,
11840                            header.infos[field_to_rename].desc,
11841                            header.infos[field_to_rename].source,
11842                            header.infos[field_to_rename].version,
11843                            header.infos[field_to_rename].type_code,
11844                        )
11845                    del header.infos[field_to_rename]
11846
11847                    # Rename INFO patterns: matches 'FIELD;' or 'FIELD=value;' at start or after ';' (trailing ';' guaranteed by the concat above)
11848                    field_pattern = rf"(^|;)({field_to_rename})(=[^;]*)?;"
11849                    if field_renamed is not None:
11850                        field_renamed_pattern = rf"\1{field_renamed}\3;"
11851                    else:
11852                        field_renamed_pattern = r"\1"
11853
11854                    # Chain this rename into the current regexp_replace partition; a fresh chain starts every regex_replace_partition renames
11855                    regex_replace_nb += 1
11856                    regex_replace_key = math.floor(
11857                        regex_replace_nb / regex_replace_partition
11858                    )
11859                    if (regex_replace_nb % regex_replace_partition) == 0:
11860                        regex_replace = "concat(INFO, ';')"
11861                    regex_replace = f"regexp_replace({regex_replace}, '{field_pattern}', '{field_renamed_pattern}')"
11862                    regex_replace_dict[regex_replace_key] = regex_replace
11863
11864                    # Return
11865                    fields_renamed[field_to_rename] = field_renamed
11866
11867                    # Log
11868                    if field_renamed is not None:
11869                        log.info(
11870                            f"Rename or remove fields - field '{field_to_rename}' renamed to '{field_renamed}'"
11871                        )
11872                    else:
11873                        log.info(
11874                            f"Rename or remove fields - field '{field_to_rename}' removed"
11875                        )
11876
11877                else:
11878
11879                    log.warning(
11880                        f"Rename or remove fields - field '{field_to_rename}' not in header"
11881                    )
11882
11883            # Rename INFO: run one UPDATE per partition of chained regexp_replace calls, stripping the helper trailing ';'
11884            for regex_replace_key, regex_replace in regex_replace_dict.items():
11885                log.info(
11886                    f"Rename or remove fields - Process [{regex_replace_key+1}/{len(regex_replace_dict)}]..."
11887                )
11888                query = f"""
11889                    UPDATE {table}
11890                    SET
11891                        INFO = regexp_replace({regex_replace}, ';$', '')
11892                """
11893                log.debug(f"query={query}")
11894                self.execute_query(query=query)
11895
11896        return fields_renamed

The rename_info_fields function renames specified fields in a VCF file header and updates corresponding INFO fields in the variants table.

Parameters
  • fields_to_rename: The fields_to_rename parameter is a dictionary that contains the mapping of fields to be renamed in a VCF (Variant Call Format) file. The keys in the dictionary represent the original field names that need to be renamed, and the corresponding values represent the new names to which the fields should be renamed; a value of None removes the field
  • table: The table parameter in the rename_info_fields function represents the name of the table in which the variants data is stored. This table contains information about genetic variants, and the function updates the corresponding INFO fields in this table when renaming specified fields in the VCF file header
Returns

The rename_info_fields function returns a dictionary fields_renamed that contains the original field names as keys and their corresponding new names (or None if the field was removed) as values after renaming or removing specified fields in a VCF file header and updating corresponding INFO fields in the variants table.

def calculation_rename_info_fields( self, fields_to_rename: dict = None, table: str = None, operation_name: str = 'RENAME_INFO_FIELDS') -> None:
11898    def calculation_rename_info_fields(
11899        self,
11900        fields_to_rename: dict = None,
11901        table: str = None,
11902        operation_name: str = "RENAME_INFO_FIELDS",
11903    ) -> None:
11904        """
11905        The `calculation_rename_info_fields` function retrieves parameters from a dictionary, updates
11906        fields to rename and table if provided, and then calls another function to rename the fields.
11907
11908        :param fields_to_rename: `fields_to_rename` is a dictionary that contains the fields to be
11909        renamed in a table. Each key-value pair in the dictionary represents the original field name as
11910        the key and the new field name as the value
11911        :type fields_to_rename: dict
11912        :param table: The `table` parameter in the `calculation_rename_info_fields` method is used to
11913        specify the name of the table for which the fields are to be renamed. It is a string type
11914        parameter
11915        :type table: str
11916        :param operation_name: The `operation_name` parameter in the `calculation_rename_info_fields`
11917        method is a string that specifies the name of the operation being performed. In this context, it
11918        is used as a default value for the operation name if not explicitly provided when calling the
11919        function, defaults to RENAME_INFO_FIELDS
11920        :type operation_name: str (optional)
11921        """
11922
11923        # Full parameter dictionary; explicit arguments take precedence over these values
11924        param = self.get_param()
11925
11926        # Fields to rename from param, under calculation > calculations > <operation_name>
11927        param_fields_to_rename = (
11928            param.get("calculation", {})
11929            .get("calculations", {})
11930            .get(operation_name, {})
11931            .get("fields_to_rename", None)
11932        )
11933
11934        # Table name from param, same nesting
11935        param_table = (
11936            param.get("calculation", {})
11937            .get("calculations", {})
11938            .get(operation_name, {})
11939            .get("table", None)
11940        )
11941
11942        # Fall back to param value when fields_to_rename is not given explicitly
11943        if fields_to_rename is None:
11944            fields_to_rename = param_fields_to_rename
11945
11946        # Fall back to param value when table is not given explicitly
11947        if table is None:
11948            table = param_table
11949
11950        renamed_fields = self.rename_info_fields(
11951            fields_to_rename=fields_to_rename, table=table
11952        )
11953
11954        log.debug(f"renamed_fields:{renamed_fields}")

The calculation_rename_info_fields function retrieves parameters from a dictionary, updates fields to rename and table if provided, and then calls another function to rename the fields.

Parameters
  • fields_to_rename: fields_to_rename is a dictionary that contains the fields to be renamed in a table. Each key-value pair in the dictionary represents the original field name as the key and the new field name as the value
  • table: The table parameter in the calculation_rename_info_fields method is used to specify the name of the table for which the fields are to be renamed. It is a string type parameter
  • operation_name: The operation_name parameter in the calculation_rename_info_fields method is a string that specifies the name of the operation being performed. In this context, it is used as a default value for the operation name if not explicitly provided when calling the function, defaults to RENAME_INFO_FIELDS
def create_annotations_view( self, table: str = None, view: str = None, view_type: str = None, fields: list = None, prefix: str = '', drop_view: bool = False, fields_to_rename: dict = None, limit: int = None) -> str:
11956    def create_annotations_view(
11957        self,
11958        table: str = None,
11959        view: str = None,
11960        view_type: str = None,
11961        fields: list = None,
11962        prefix: str = "",
11963        drop_view: bool = False,
11964        fields_to_rename: dict = None,
11965        limit: int = None,
11966    ) -> str:
11967        """
11968        The `create_annotations_view` function creates a SQL view from fields in a VCF INFO column.
11969
11970        :param table: The `table` parameter in the `create_annotations_view` function is used to specify
11971        the name of the table from which the fields are to be extracted. This table contains the
11972        variants data, and the function creates a view based on the fields in the INFO column of this
11973        table
11974        :type table: str
11975        :param view: The `view` parameter in the `create_annotations_view` function is used to specify
11976        the name of the view that will be created based on the fields in the VCF INFO column. This view
11977        will contain the extracted fields from the INFO column in a structured format for further
11978        processing or analysis
11979        :type view: str
11980        :param view_type: The `view_type` parameter in the `create_annotations_view` function is used to
11981        specify the type of view that will be created. It can be either a `VIEW` or a `TABLE`, and the
11982        function will create the view based on the specified type
11983        :type view_type: str
11984        :param fields: The `fields` parameter in the `create_annotations_view` function is a list that
11985        contains the names of the fields to be extracted from the INFO column in the VCF file. These
11986        fields will be used to create the view with the specified columns and data extracted from the
11987        INFO column
11988        :type fields: list
11989        :param prefix: The `prefix` parameter in the `create_annotations_view` function is used to
11990        specify a prefix that will be added to the field names in the view. This prefix helps in
11991        distinguishing the fields extracted from the INFO column in the view
11992        :type prefix: str
11993        :param drop_view: The `drop_view` parameter in the `create_annotations_view` function is a boolean
11994        flag that determines whether to drop the existing view with the same name before creating a new
11995        view. If set to `True`, the function will drop the existing view before creating a new view with
11996        the specified name
11997        :type drop_view: bool
11998        :param fields_to_rename: The `fields_to_rename` parameter in the `create_annotations_view`
11999        function is a dictionary that contains the mapping of fields to be renamed in the VCF file. The
12000        keys in the dictionary represent the original field names that need to be renamed, and the
12001        corresponding values represent the new names to which the fields should be renamed
12002        :type fields_to_rename: dict
12003        :param limit: The `limit` parameter in the `create_annotations_view` function is an integer that
12004        specifies the maximum number of rows to be included in the view. If provided, the function will
12005        limit the number of rows in the view to the specified value
12006        :type limit: int
12007        :return: The `create_annotations_view` function returns the name of the view that is created
12008        based on the fields extracted from the INFO column in the VCF file. This view contains the
12009        extracted fields in a structured format for further processing or analysis
12010        """
12011
12012        # Create a sql view from fields in VCF INFO column, with each column is a field present in the VCF header (with a specific type from VCF header) and extracted from INFO column (with a regexp like in rename_info_fields), and each row is a variant.
12013
12014        # Get table (default: main variants table)
12015        if table is None:
12016            table = self.get_table_variants()
12017
12018        # Get view (default: derived from the table name)
12019        if view is None:
12020            view = f"{table}_annotations"
12021
12022        # Get view type
12023        if view_type is None:
12024            view_type = "VIEW"
12025
12026        # Check view type value
12027        if view_type.upper() not in ["VIEW", "TABLE"]:
12028            raise ValueError(
12029                f"Invalid view type value: {view_type}. Either 'VIEW' or 'TABLE'"
12030            )
12031
12032        # Get header
12033        header = self.get_header()
12034
12035        # Get fields (default: every INFO field declared in the header)
12036        if fields is None:
12037            fields = list(header.infos.keys())
12038
12039        # Get fields to rename
12040        if fields_to_rename is None:
12041            fields_to_rename = {}
12042
12043        log.info(
12044            f"Create '{view}' view (as '{view_type}') from table '{table}' with {len(fields)} fields"
12045        )
12046
12047        # Describe table to learn which fields already exist as real columns
12048        table_describe_query = f"""
12049            DESCRIBE {table}
12050        """
12051        table_describe = self.get_query_to_df(query=table_describe_query)
12052
12053        # Create fields for annotation view extracted from INFO column in table variants (with regexp_replace like in rename_info_fields), with column type from VCF header
12054        fields_columns = []
12055        fields_needed = ["#CHROM", "POS", "REF", "ALT"]
12056        field_sql_type_list = False
12057        for field in fields:
12058
12059            # Rename field (fall back to the original name when no rename is mapped)
12060            field_to_rename = fields_to_rename.get(field, field)
12061
12062            # Choose the extraction expression depending on where the field lives and its type
12063
12064            # Needed fields are always selected verbatim below; skip duplicates here
12065            if field in fields_needed:
12066                continue
12067
12068            # Fields already materialized as table columns are selected directly
12069            elif field in list(table_describe.get("column_name")):
12070                fields_columns.append(f""" "{field}" AS '{prefix}{field_to_rename}' """)
12071
12072            # Fields declared in the VCF header are extracted from INFO
12073            elif field in header.infos:
12074
12075                # Field info
12076                field_infos = header.infos.get(field, None)
12077
12078                # Field SQL type
12079                field_sql_type = code_type_map_to_sql.get(field_infos.type, "VARCHAR")
12080
12081                # Column is a list — NOTE(review): this flag is never reset per iteration, so once one list-typed field is seen, all later non-Flag fields are cast as lists too; confirm intent
12082                if field_infos.num != 1:
12083                    field_sql_type_list = True
12084
12085                # Column is a Flag (presence-only INFO field, rendered as BOOLEAN)
12086                if field_infos.type == "Flag":
12087                    field_pattern = rf"(^|;)({field})([^;]*)?"  # NOTE(review): also matches fields whose name starts with this field's name — verify
12088                    fields_columns.append(
12089                        f""" regexp_matches("INFO", '{field_pattern}')::BOOLEAN AS '{prefix}{field_to_rename}' """
12090                    )
12091
12092                # Column with a typed value
12093                else:
12094
12095                    # Field pattern
12096                    field_pattern = rf"(^|;)({field})=([^;]*)?"
12097
12098                    # Field is a list: split on ',' and map '.'/'' entries to NULL before casting
12099                    if field_sql_type_list:
12100                        fields_columns.append(
12101                            f""" CAST(list_transform(string_split(NULLIF(regexp_extract("INFO", '{field_pattern}', 3), ''), ','), x -> CASE WHEN x = '.' OR x = '' THEN NULL ELSE x END) AS {field_sql_type}[]) AS '{prefix}{field_to_rename}' """
12102                        )
12103
12104                    # Field is a unique value: '.' and '' become NULL before casting
12105                    else:
12106                        fields_columns.append(
12107                            f""" NULLIF(regexp_replace(regexp_extract("INFO", '{field_pattern}', 3), '^\\.$', ''), '')::{field_sql_type} AS '{prefix}{field_to_rename}' """
12108                        )
12109
12110            else:
12111                fields_columns.append(f""" null AS '{prefix}{field_to_rename}' """)
12112                msg_err = f"Field '{field}' is not found (in table or header): '{field}' will be set to NULL"
12113                log.warning(msg=msg_err)
12114
12115        # Limit
12116        limit_clause = ""
12117        if limit is not None:
12118            limit_clause = f" LIMIT {limit} "
12119
12120        # Query select
12121        query_select = f"""
12122            SELECT
12123                {', '.join([f'"{field}"' for field in fields_needed])}, {", ".join(fields_columns)}
12124            FROM
12125                {table}
12126            {limit_clause}
12127        """
12128
12129        # Drop the existing view/table first if requested
12130        if drop_view:
12131            log.debug(f"Drop view: {view}")
12132            query_create_view = f"""
12133                DROP {view_type} IF EXISTS {view}
12134            """
12135            self.execute_query(query=query_create_view)
12136            log.debug(f"View dropped: {view}")
12137
12138        # Create view
12139        log.debug(f"Create view: {view}")
12140        query_create_view = f"""
12141            CREATE {view_type} IF NOT EXISTS {view} AS {query_select}
12142        """
12143        # log.debug(f"query_create_view:{query_create_view}")
12144        self.execute_query(query=query_create_view)
12145        log.debug(f"View created: {view}")
12146
12147        return view

The create_annotations_view function creates a SQL view from fields in a VCF INFO column.

Parameters
  • table: The table parameter in the create_annotations_view function is used to specify the name of the table from which the fields are to be extracted. This table contains the variants data, and the function creates a view based on the fields in the INFO column of this table
  • view: The view parameter in the create_annotations_view function is used to specify the name of the view that will be created based on the fields in the VCF INFO column. This view will contain the extracted fields from the INFO column in a structured format for further processing or analysis
  • view_type: The view_type parameter in the create_annotations_view function is used to specify the type of view that will be created. It can be either a VIEW or a TABLE, and the function will create the view based on the specified type
  • fields: The fields parameter in the create_annotations_view function is a list that contains the names of the fields to be extracted from the INFO column in the VCF file. These fields will be used to create the view with the specified columns and data extracted from the INFO column
  • prefix: The prefix parameter in the create_annotations_view function is used to specify a prefix that will be added to the field names in the view. This prefix helps in distinguishing the fields extracted from the INFO column in the view
  • drop_view: The drop_view parameter in the create_annotations_view function is a boolean flag that determines whether to drop the existing view with the same name before creating a new view. If set to True, the function will drop the existing view before creating a new view with the specified name
  • fields_to_rename: The fields_to_rename parameter in the create_annotations_view function is a dictionary that contains the mapping of fields to be renamed in the VCF file. The keys in the dictionary represent the original field names that need to be renamed, and the corresponding values represent the new names to which the fields should be renamed
  • limit: The limit parameter in the create_annotations_view function is an integer that specifies the maximum number of rows to be included in the view. If provided, the function will limit the number of rows in the view to the specified value
Returns

The create_annotations_view function returns the name of the view that is created based on the fields extracted from the INFO column in the VCF file. This view contains the extracted fields in a structured format for further processing or analysis.